第一句子网 > python语音识别音频文件的方法

python语音识别音频文件的方法

时间：2023-12-26 06:17:01

相关推荐

python语音识别音频文件的方法

问题描述

语音识别问题首先可以通过深度学习方法，训练语言模型后进行音频识别，但该方法要求设备内存足够大，训练时间通常较久，后期有时间的话会再学习使用；

另外就是离线识别方法——使用 python 的第三方语音识别包 SpeechRecognition（配合 PocketSphinx，需通过 pip 安装），对于简单音频的识别率还可以；

联网识别方法——调用各大已经实现语音识别功能网站的接口，目前人家做得已经很成熟了，比较常见的有百度、讯飞，这里使用的标贝科技也不错，具体参数可以去官网查询使用。

查阅很多资料，有些代码是不能用的，这里就直接给出测试过的代码了，都是python语言，别的知识自己去查吧

SpeechRecognition

支持音频文件类型：
WAV：必须是 PCM/LPCM 格式；AIFF；AIFF-C；FLAC：必须是原生 FLAC 格式，OGG-FLAC 格式不可用

# Run these commands in a terminal to install the dependencies.
pip3 install SpeechRecognition
pip3 install pocketsphinx
# PyAudio is only required if you want to read from the microphone.
pip3 install PyAudio
# Recognizing Chinese (used below) additionally needs a Chinese language pack
# for PocketSphinx; it is not installed by the commands above and has to be
# obtained separately.

# -*- coding: utf-8 -*-
# SpeechRecognition + PocketSphinx examples. Every section below runs the
# Sphinx engine locally; none of them calls a web service.
import speech_recognition as sr
from os import path

# ---- Offline recognition of an audio file ----
# sr.AudioFile only reads WAV / AIFF / AIFF-C / FLAC, so an MP3 has to be
# converted first; the example therefore points at a WAV file.
audio_file = path.join('C:/Users/263000/Desktop', 'test.wav')

r = sr.Recognizer()
with sr.AudioFile(audio_file) as source:
    # Read the whole file into an AudioData object.
    audio = r.record(source)

try:
    # language='zh-CN' needs the Chinese Sphinx language pack to be installed.
    print(" 音频内容为： " + r.recognize_sphinx(audio, language='zh-CN'))
except sr.UnknownValueError:
    print('Sphinx could not understand audio')
except sr.RequestError as e:
    print('Sphinx error; {0}'.format(e))

# Alternatives:
# print('文本内容: ', r.recognize_sphinx(audio, language='zh-CN'))  # Chinese
# print('文本内容: ', r.recognize_sphinx(audio))  # English

# ---- Recognition from the microphone ----
# (The article labels this "online recognition", but it still uses Sphinx
# locally; only the audio source changes.) Requires PyAudio.
r = sr.Recognizer()
with sr.Microphone() as source:
    # Listen for 1 second to calibrate the energy threshold to ambient noise.
    r.adjust_for_ambient_noise(source)
    print('say something')
    audio = r.listen(source)

# Recognize speech using Sphinx (default language model).
try:
    print("Sphinx thinks you said " + r.recognize_sphinx(audio))
except sr.UnknownValueError:
    print("Sphinx could not understand audio")
except sr.RequestError as e:
    print("Sphinx error; {0}".format(e))

# ---- Some refinements ----
# A source is only usable inside its `with` block, so each refinement opens
# the audio file again.

# offset sets where recording starts and duration how long it lasts.
with sr.AudioFile(audio_file) as source:
    audio = r.record(source, offset=4.7, duration=2.8)

# adjust_for_ambient_noise() reduces the influence of background noise; the
# portion of the file it listens to is consumed before record() runs.
with sr.AudioFile(audio_file) as source:
    r.adjust_for_ambient_noise(source)
    audio = r.record(source)

百度

只支持 pcm/wav/amr 格式，采样率为固定值16000

# coding=utf-8
"""Recognize a local audio file with the Baidu speech recognition REST API.

Fill in API_KEY / SECRET_KEY, point AUDIO_FILE at a 16 kHz pcm/wav/amr file
and run the script; the raw JSON answer is printed and saved to result.txt.
"""
import json
import time
from urllib.request import urlopen
from urllib.request import Request
from urllib.error import HTTPError
from urllib.error import URLError
from urllib.parse import urlencode

timer = time.time

API_KEY = '你自己的'
SECRET_KEY = '你自己的'

# File to recognize. Only pcm/wav/amr are supported; the "express" edition
# additionally supports m4a.
AUDIO_FILE = './test.wav'
# Audio format, taken from the file extension (see the supported list above).
FORMAT = AUDIO_FILE[-3:]

CUID = '123456PYTHON'
# Sample rate; fixed value.
RATE = 16000

# Standard edition.
# 1537 = Mandarin with the input-method model. Pick the PID for the language
# and model you need from the Baidu documentation.
DEV_PID = 1537
# NOTE(review): the host was missing from the published listing; restored
# from Baidu's public REST documentation -- confirm against the current docs.
ASR_URL = 'http://vop.baidu.com/server_api'
# Having this scope means the app has ASR permission. If it is missing,
# enable it in the web console; very old apps may not have it at all.
SCOPE = 'audio_voice_assistant_get'

# NOTE(review): host restored as for ASR_URL -- confirm against current docs.
TOKEN_URL = 'https://aip.baidubce.com/oauth/2.0/token'


class DemoError(Exception):
    """Raised when the token or recognition request cannot be completed."""


def read_response(req, label):
    """Send *req* and return the raw response body as bytes.

    An HTTP error status is reported on stdout and its body is returned, so
    the caller can still inspect the service's error JSON.

    Args:
        req: a prepared urllib Request.
        label: short name of the call ('token' or 'asr') used in messages.

    Raises:
        DemoError: if the server could not be reached at all.
    """
    try:
        with urlopen(req) as f:
            return f.read()
    except HTTPError as err:
        # Only HTTPError carries a status code and a readable body.
        print(label + ' http response http code : ' + str(err.code))
        return err.read()
    except URLError as err:
        raise DemoError(
            label + ' request failed: ' + str(err.reason)) from err


def fetch_token():
    """Exchange API_KEY / SECRET_KEY for an access token.

    Returns:
        The access token string.

    Raises:
        DemoError: if the response lacks a token or scope, or if SCOPE is
            set and not among the granted scopes.
    """
    params = {'grant_type': 'client_credentials',
              'client_id': API_KEY,
              'client_secret': SECRET_KEY}
    post_data = urlencode(params).encode('utf-8')
    req = Request(TOKEN_URL, post_data)

    result = json.loads(read_response(req, 'token').decode())
    if 'access_token' in result and 'scope' in result:
        # Setting SCOPE = False skips the scope check.
        if SCOPE and (SCOPE not in result['scope'].split(' ')):
            raise DemoError('scope is not correct')
        return result['access_token']
    raise DemoError('MAYBE API_KEY or SECRET_KEY not correct: '
                    'access_token or scope not found in token response')


def build_asr_request(speech_data, token):
    """Build the POST request that uploads *speech_data* for recognition.

    Args:
        speech_data: raw bytes of the audio file.
        token: access token returned by fetch_token().

    Returns:
        A urllib Request with the query string, body and audio headers set.
    """
    params = {'cuid': CUID, 'token': token, 'dev_pid': DEV_PID}
    headers = {
        'Content-Type': 'audio/' + FORMAT + '; rate=' + str(RATE),
        'Content-Length': len(speech_data),
    }
    return Request(ASR_URL + "?" + urlencode(params), speech_data, headers)


def main():
    """Recognize AUDIO_FILE, print the JSON answer and save it to result.txt."""
    token = fetch_token()

    with open(AUDIO_FILE, 'rb') as speech_file:
        speech_data = speech_file.read()
    if not speech_data:
        raise DemoError('file %s length read 0 bytes' % AUDIO_FILE)

    req = build_asr_request(speech_data, token)
    begin = timer()
    raw = read_response(req, 'asr')
    print("Request time cost %f" % (timer() - begin))

    result_str = str(raw, 'utf-8')
    print(result_str)
    # Explicit encoding: the answer contains Chinese text.
    with open("result.txt", "w", encoding="utf-8") as of:
        of.write(result_str)


if __name__ == '__main__':
    main()

标贝

与百度一样，需要先去官网认证登录，创建语音识别的应用，获取自己的API和SECRET，每天只有有限次数的免费使用额度

#!/usr/bin/env python
# coding: utf-8
"""Recognize a local audio file with the Data Baker (标贝) ASR REST API."""
import requests
import json
import argparse

# Seconds to wait for the service before giving up instead of hanging.
REQUEST_TIMEOUT = 30

# NOTE(review): the host was truncated to "data-" in the published listing;
# restored to Data Baker's public domain -- confirm against the official docs.
TOKEN_URL = "https://openapi.data-baker.com/oauth/2.0/token"
ASR_URL = "https://asr.data-baker.com/asr/api?"


def get_access_token(client_secret, client_id):
    """Fetch the access token used to authenticate recognition requests.

    Args:
        client_secret: secret of the application created on the website.
        client_id: id (API key) of that application.

    Returns:
        The token string, or None if the request failed (the error is
        printed) or the response carries no 'access_token' field.
    """
    # Let requests build the query string so the values are URL-encoded.
    params = {
        'grant_type': "client_credentials",
        'client_secret': client_secret,
        'client_id': client_id,
    }
    try:
        response = requests.post(TOKEN_URL, params=params,
                                 timeout=REQUEST_TIMEOUT)
        response.raise_for_status()
    except requests.RequestException as e:
        print(e)
        return None
    return json.loads(response.text).get('access_token')


def get_text(file, headers):
    """Upload audio bytes and return the recognized text.

    Args:
        file: raw bytes of the audio file.
        headers: request headers carrying the access token and audio
            parameters (see the __main__ section).

    Returns:
        The 'text' field of the response, or None if it is absent. When the
        response code is not 20000 the full response is printed as well.
    """
    response = requests.post(ASR_URL, data=file, headers=headers,
                             timeout=REQUEST_TIMEOUT)
    result = json.loads(response.text)
    if result.get("code") != 20000:
        print(response.text)
    return result.get("text")


def get_args():
    """Parse the command-line arguments."""
    parser = argparse.ArgumentParser(description='ASR')
    parser.add_argument('-client_secret', type=str, required=True)
    parser.add_argument('-client_id', type=str, required=True)
    parser.add_argument('-file_path', type=str, required=True)
    parser.add_argument('--audio_format', type=str, default='wav')
    parser.add_argument('--sample_rate', type=str, default='16000')
    parser.add_argument('--add_pct', type=str, default='true')
    return parser.parse_args()


if __name__ == '__main__':
    args = get_args()

    # Get the access token; without one the recognition call cannot succeed.
    access_token = get_access_token(args.client_secret, args.client_id)
    if access_token is None:
        raise SystemExit('could not obtain access_token')

    # Read the audio file.
    with open(args.file_path, 'rb') as f:
        audio_bytes = f.read()

    # Fill in the header fields.
    headers = {
        'access_token': access_token,
        'audio_format': args.audio_format,
        'sample_rate': args.sample_rate,
        'add_pct': args.add_pct,
    }
    text = get_text(audio_bytes, headers)
    print(text)

本内容不代表本网观点和政治立场，如有侵犯你的权益请联系我们处理。

网友评论

网友评论仅供其表达个人看法，并不表明网站立场。