因为需要语音转文字,就找了一些工具,发现要么不好用,要么付费,突然想起来 iOS 原生不就支持么,而且 iPhone 自带 AI 芯片,就看看 iOS 的 SpeechKit 框架,发现已经高度封装了,使用起来特别方便,而且,最主要的是,这个框架非常轻量。
从 Speech.h 可以看到:
#import <Speech/SFSpeechRecognitionMetadata.h>
#import <Speech/SFSpeechRecognitionRequest.h>
#import <Speech/SFSpeechRecognitionResult.h>
#import <Speech/SFSpeechRecognitionTask.h>
#import <Speech/SFSpeechRecognitionTaskHint.h>
#import <Speech/SFSpeechRecognizer.h>
#import <Speech/SFTranscription.h>
#import <Speech/SFTranscriptionSegment.h>
#import <Speech/SFVoiceAnalytics.h>
就这么几个文件。
// Acoustic features extracted from a recognized audio segment.
// Available on iOS 13+ / macOS 10.15+.
API_AVAILABLE(ios(13), macos(10.15))
@interface SFAcousticFeature : NSObject
// The feature value for each audio frame in the segment.
// NOTE(review): the <NSNumber *> generic parameter was stripped by the
// HTML export; restored here to match the SDK header.
@property (nonatomic, readonly, copy) NSArray<NSNumber *> *acousticFeatureValuePerFrame;
// Duration of a single audio frame, in seconds.
@property (nonatomic, readonly) NSTimeInterval frameDuration;
@end
// Voice analytics corresponding to a segment of recorded audio.
API_AVAILABLE(ios(13), macos(10.15))
@interface SFVoiceAnalytics : NSObject
// Jitter: measures pitch stability, expressed as a percentage.
@property (nonatomic, readonly, copy) SFAcousticFeature *jitter;
// Shimmer: measures loudness stability, expressed in decibels.
@property (nonatomic, readonly, copy) SFAcousticFeature *shimmer;
// Pitch: highness or lowness of the tone, measured as the
// logarithm of the normalized pitch estimate.
@property (nonatomic, readonly, copy) SFAcousticFeature *pitch;
// Voicing: probability that a given frame is voiced.
@property (nonatomic, readonly, copy) SFAcousticFeature *voicing;
@end
// Metadata associated with a speech recognition result.
// Available on iOS 14.5+ / macOS 11.3+.
API_AVAILABLE(ios(14.5), macos(11.3))
@interface SFSpeechRecognitionMetadata : NSObject
// Speaking rate, in words per minute.
@property (nonatomic, readonly) double speakingRate;
// Average pause duration between words, in seconds.
@property (nonatomic, readonly) NSTimeInterval averagePauseDuration;
// Timestamp at which the speech starts, in seconds.
@property (nonatomic, readonly) NSTimeInterval speechStartTimestamp;
// Duration of the speech, in seconds.
@property (nonatomic, readonly) NSTimeInterval speechDuration;
// Acoustic analysis of the utterance, if available.
@property (nonatomic, nullable, readonly) SFVoiceAnalytics *voiceAnalytics;
@end
// The result of a speech recognition request.
API_AVAILABLE(ios(10.0), macos(10.15))
@interface SFSpeechRecognitionResult : NSObject
// The transcription with the highest confidence level.
@property (nonatomic, readonly, copy) SFTranscription *bestTranscription;
// Candidate transcriptions, ordered from highest to lowest confidence.
// NOTE(review): the <SFTranscription *> generic parameter was stripped
// by the HTML export; restored here to match the SDK header.
@property (nonatomic, readonly, copy) NSArray<SFTranscription *> *transcriptions;
// YES when recognition is complete and all transcriptions are final.
@property (nonatomic, readonly, getter=isFinal) BOOL final;
// Metadata about the utterance (speaking rate, pauses, …); iOS 14+ / macOS 11+.
@property (nonatomic, nullable, readonly) SFSpeechRecognitionMetadata *speechRecognitionMetadata API_AVAILABLE(ios(14.0), macos(11.0));
@end
// Abstract base class for speech recognition requests.
API_AVAILABLE(ios(10.0), macos(10.15))
@interface SFSpeechRecognitionRequest : NSObject
// A hint describing the kind of speech being recognized (dictation, search, …).
@property (nonatomic) SFSpeechRecognitionTaskHint taskHint;
// If YES, partial (non-final) results for each utterance are reported.
// Default is YES.
@property (nonatomic) BOOL shouldReportPartialResults;
// Phrases which should be recognized even if they are not in the system vocabulary.
// NOTE(review): the <NSString *> generic parameter was stripped by the
// HTML export; restored here to match the SDK header.
@property (nonatomic, copy) NSArray<NSString *> *contextualStrings;
// If YES, recognition runs entirely on the device. Default is NO; iOS 13+.
@property (nonatomic) BOOL requiresOnDeviceRecognition API_AVAILABLE(ios(13), macos(10.15));
@end
// A request to recognize speech from a recorded audio file.
API_AVAILABLE(ios(10.0), macos(10.15))
@interface SFSpeechURLRecognitionRequest : SFSpeechRecognitionRequest
// Use -initWithURL: instead.
- (instancetype)init NS_UNAVAILABLE;
// Request to transcribe speech from an audio file at the given URL.
- (instancetype)initWithURL:(NSURL *)URL NS_DESIGNATED_INITIALIZER;
// The URL of the audio file to transcribe.
@property (nonatomic, readonly, copy) NSURL *URL;
@end
// A request to recognize speech from captured audio buffers (e.g. a live stream).
API_AVAILABLE(ios(10.0), macos(10.15))
@interface SFSpeechAudioBufferRecognitionRequest : SFSpeechRecognitionRequest
// Preferred audio format for optimal speech recognition.
@property (nonatomic, readonly) AVAudioFormat *nativeAudioFormat;
// Appends audio to the end of the recognition stream.
// Must currently be in the native format.
- (void)appendAudioPCMBuffer:(AVAudioPCMBuffer *)audioPCMBuffer;
// Appends a Core Media sample buffer to the recognition stream.
- (void)appendAudioSampleBuffer:(CMSampleBufferRef)sampleBuffer;
// Indicates that the audio source is finished and no more audio will be appended.
- (void)endAudio;
@end
语音识别请求的task,比较重要的是 SFSpeechRecognitionTaskDelegate。
只有一个 enum,比较简单。
// Hints on the kind of speech recognition being performed.
typedef NS_ENUM(NSInteger, SFSpeechRecognitionTaskHint) {
SFSpeechRecognitionTaskHintUnspecified = 0, // Unspecified recognition
SFSpeechRecognitionTaskHintDictation = 1, // General dictation/keyboard-style
SFSpeechRecognitionTaskHintSearch = 2, // Search-style requests
SFSpeechRecognitionTaskHintConfirmation = 3, // Short, confirmation-style requests ("Yes", "No", "Maybe")
} API_AVAILABLE(ios(10.0), macos(10.15));
最主要的类,用于申请识别权限,创建识别任务。
Original: https://blog.csdn.net/woshizshu/article/details/124221468
Author: woshizshu
Title: iOS SpeechKit API 解读