Skip to content

Commit

Permalink
Merge pull request SpeechColab#39 from Lizerui9926/master
Browse files Browse the repository at this point in the history
Modify aliyun_ftasr sdk
  • Loading branch information
dophist authored Sep 3, 2022
2 parents 17b9cab + e40c66e commit 1dcf7c1
Show file tree
Hide file tree
Showing 5 changed files with 119 additions and 22 deletions.
20 changes: 16 additions & 4 deletions models/aliyun_ftasr_api_zh/SBI
Original file line number Diff line number Diff line change
Expand Up @@ -8,16 +8,18 @@ fi
scp=$1
dir=$2
oss_scp=${dir}/wav_oss.scp
oss_loss_scp=${dir}/wav_oss_loss.scp
part_nums=10
loss_base_nums=10
test_time=$(date +"%Y-%m-%d-%H-%M-%S")
echo "Test starts at "${test_time}

# 命令行工具ossutil 下载和安装 https://help.aliyun.com/document_detail/120075.html
# 可通过命令(wget https://gosspublic.alicdn.com/ossutil/1.7.13/ossutil64) 获取相同版本ossutil64(版本1.7.13) 进行md5sum一致性比较
# compare ossutil64 md5sum
md5sum ossutil64 > ${dir}/ossutil64.md5.compare
diff -r ${dir}/ossutil64.md5.compare ./ossutil64.md5.txt
if [ $? -ne 0 ];then
diff -r ${dir}/ossutil64.md5.compare ./ossutil64.md5.txt > ${dir}/ossutil64.md5.compare.diff
if [ -s ${dir}/ossutil64.md5.compare.diff ];then
echo "md5sum of ossutil64 is different! Exit!"
exit 1
else
Expand All @@ -26,7 +28,17 @@ fi


# upload wav to oss
./upload_wav.py ${scp} ${oss_scp} ${dir} ${test_time}
./upload_wav.py ${scp} ${dir} speechiotest-${test_time}

# check oss wav
./check_wav.py speechiotest-${test_time} ${scp} ${oss_loss_scp} ${oss_scp}
loss_wav_num=$(cat $oss_loss_scp | wc -l)
if [ ${loss_wav_num} -gt ${loss_base_nums} ];then
echo "Not found some audio in oss bucket! Please check" ${oss_loss_scp}
exit 1
else
echo "Upload audio successfully. Continue..."
fi

# split test set
awk -vf=${part_nums} -vl="`wc -l $oss_scp`" 'BEGIN{p=int(l/f);q=(l%f);for(n=1;n<=f;n++)a[n]=n*p+((n<=q)?++x:x)}{if(NR>a[i])i++;print > "'$oss_scp'."i}' $oss_scp
Expand All @@ -44,4 +56,4 @@ rm -rf ${dir}/raw_rec.txt.*
rm -rf ${oss_scp}.*

# delete oss bucket
./del_oss_bucket.py ${dir} ${test_time}
./del_oss_bucket.py ${dir} speechiotest-${test_time}
85 changes: 85 additions & 0 deletions models/aliyun_ftasr_api_zh/check_wav.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
'''
录音文件识别 接口说明
https://help.aliyun.com/document_detail/90727.html
通过ossutil命令获取文件URL
egs: 为目标存储空间examplebucket下的文件exampleobject.png生成文件URL,并指定超时时间为3600秒。
./ossutil64 sign oss://examplebucket/exampleobject.png --timeout 3600
'''
import sys
import os
import time
import codecs
import oss2


def _read_first_line(path):
    """Return the first line of the text file at *path*, stripped of whitespace."""
    with open(path, 'r') as handle:
        return handle.readline().strip()


# Aliyun credentials are read at import time from two files whose relative
# paths resolve against the current working directory.
access_key_id = _read_first_line('ACCESS_KEY_ID')
access_key_secret = _read_first_line('ACCESS_KEY_SECRET')

def retry_upload(wavId, wavPath, ossPath, times=10):
    """Upload one wav file with ossutil64 and retry until it is visible in the bucket.

    After every upload attempt the bucket is asked whether the object
    ``wav/<wavId>.wav`` exists.  Once it does, a signed URL for the object is
    generated with ``ossutil64 sign`` and written to the module-level
    ``wavOssFile`` as ``<wavId>\\t<url>``.

    This function relies on the module-level names ``ossbucket``,
    ``wavOssFile``, ``access_key_id`` and ``access_key_secret``.

    Args:
        wavId: Utterance id; also the object's base name in the bucket.
        wavPath: Local path of the wav file to upload.
        ossPath: Destination prefix, e.g. ``oss://<bucket>/wav/``.
        times: Number of additional attempts allowed after the first one.

    Returns:
        True if the object was found in the bucket after some attempt,
        False once all attempts are used up.
    """
    # NOTE(review): the credentials are passed on the command line, so they are
    # visible in the process list while ossutil64 runs.
    upload_command = "./ossutil64 cp {wav} {osspath} -e oss-cn-hangzhou.aliyuncs.com -i {access_key_id} -k {access_key_secret}".format(wav=wavPath, osspath=ossPath, access_key_id=access_key_id, access_key_secret=access_key_secret)
    object_key = 'wav/{wavId}.wav'.format(wavId=wavId)

    remaining = times
    while True:
        # Close the pipe explicitly so the child process is reaped right away.
        with os.popen(upload_command) as upload_res:
            for r in upload_res:
                print(wavId + "\t" + r.strip())

        if ossbucket.object_exists(object_key):
            # Report the real number of retries instead of assuming times == 10.
            print('{wavId} {wavPath} object exist, retry {time} times successfully.'.format(wavId=wavId, wavPath=wavPath, time=times - remaining))
            sign_command = "./ossutil64 sign {osspath}{idx}.wav --timeout {times} -e oss-cn-hangzhou.aliyuncs.com -i {access_key_id} -k {access_key_secret}".format(osspath=ossPath, idx=wavId, times=32400, access_key_id=access_key_id, access_key_secret=access_key_secret)
            with os.popen(sign_command) as res:
                for audio in res:
                    # Only the first output line (the signed URL) is recorded.
                    wavOssFile.write(wavId + "\t" + audio)
                    break
            return True

        if remaining <= 0:
            return False

        time.sleep(5)
        print('{wavId} {wavPath} object not exist, will retry upload {time} times.'.format(wavId=wavId, wavPath=wavPath, time=remaining))
        remaining -= 1


if __name__ == "__main__":
    # Verify that every wav listed in <wav_scp> exists in the OSS bucket.
    # For each object found, a signed URL is written to <oss_out_scp>; missing
    # objects are re-uploaded via retry_upload(), and entries that still cannot
    # be found are copied to <wav_loss_scp>.
    if len(sys.argv) != 5:
        sys.stderr.write("check_wav.py <bucket_name> <wav_scp> <wav_loss_scp> <oss_out_scp>\n")
        sys.exit(-1)

    bucketName = sys.argv[1]

    # The context managers close all three files, including the signed-URL
    # output, even when an exception interrupts the loop.  ``wavOssFile`` and
    # ``ossbucket`` stay module-level names because retry_upload() uses them.
    with codecs.open(sys.argv[2], 'r', 'utf8') as wavFile, \
            codecs.open(sys.argv[3], 'w+', 'utf8') as wavLossFile, \
            codecs.open(sys.argv[4], 'w+', 'utf8') as wavOssFile:

        auth = oss2.Auth(access_key_id, access_key_secret)
        ossbucket = oss2.Bucket(auth, 'https://oss-cn-hangzhou.aliyuncs.com', bucketName)

        bucket = "oss://{bucketName}".format(bucketName=bucketName)
        ossPath = bucket + "/wav/"

        for meta in wavFile:
            wavmeta = meta.split()
            if len(wavmeta) < 2:
                # Blank or malformed scp line: nothing to check, so skip it
                # instead of failing with IndexError.
                if wavmeta:
                    sys.stderr.write("skip malformed scp line: " + meta.strip() + "\n")
                continue
            wavId = wavmeta[0]
            wavPath = wavmeta[1]

            exist = ossbucket.object_exists('wav/{wavId}.wav'.format(wavId=wavId))
            if exist:
                print('{wavId} {wavPath} object exist'.format(wavId=wavId, wavPath=wavPath))
                # Generate a signed URL for the object.
                sign_command = "./ossutil64 sign {osspath}{idx}.wav --timeout {times} -e oss-cn-hangzhou.aliyuncs.com -i {access_key_id} -k {access_key_secret}".format(osspath=ossPath, idx=wavId, times=32400, access_key_id=access_key_id, access_key_secret=access_key_secret)
                with os.popen(sign_command) as res:
                    for audio in res:
                        # Only the first output line (the signed URL) is recorded.
                        wavOssFile.write(wavId + "\t" + audio)
                        break
            else:
                print('{wavId} {wavPath} object not exist'.format(wavId=wavId, wavPath=wavPath))
                retry_flag = retry_upload(wavId, wavPath, ossPath, times=10)
                if not retry_flag:
                    # Record the entry right away so the loss list survives a crash.
                    wavLossFile.write(meta)
                    wavLossFile.flush()
9 changes: 6 additions & 3 deletions models/aliyun_ftasr_api_zh/del_oss_bucket.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,12 +15,15 @@

if __name__ == "__main__":
if len(sys.argv) != 3:
sys.stderr.write("del_oss_bucket.py <dir> <test_time>\n")
sys.stderr.write("del_oss_bucket.py <dir> <bucket_name>\n")
exit(-1)
print("decoding finish")
oss_config_path = sys.argv[1]
test_time = sys.argv[2]
bucketName = sys.argv[2]

bucket = "oss://speechiotest-{test_time}".format(test_time=test_time)
bucket = "oss://{bucketName}".format(bucketName=bucketName)
del_command = "./ossutil64 rm {bucket} -b -a -r -f -e oss-cn-hangzhou.aliyuncs.com -i {access_key_id} -k {access_key_secret}".format(bucket=bucket, access_key_id=access_key_id, access_key_secret=access_key_secret)
del_res = os.popen(del_command)
for u in del_res:
print(u)
time.sleep(20)
2 changes: 1 addition & 1 deletion models/aliyun_ftasr_api_zh/docker/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
python3-pip && \
rm -rf /var/lib/apt/lists/*

RUN pip3 install aliyun-python-sdk-core==2.13.3
RUN pip3 install oss2 aliyun-python-sdk-core==2.13.3

# Use C.UTF-8 locale to avoid issues with ASCII encoding
ENV LC_ALL=C.UTF-8
Expand Down
25 changes: 11 additions & 14 deletions models/aliyun_ftasr_api_zh/upload_wav.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,17 +21,17 @@
access_key_secret = f.readline().strip()

if __name__ == "__main__":
if len(sys.argv) != 5:
sys.stderr.write("upload_wav.py <wav_scp> <oss_out_scp> <dir> <test_time>\n")
if len(sys.argv) != 4:
sys.stderr.write("upload_wav.py <wav_scp> <dir> <bucket_name>\n")
exit(-1)

wavDict = {}
wavFile = codecs.open(sys.argv[1], 'r', 'utf8')
oss_config_path = sys.argv[3]
test_time = sys.argv[4]
oss_config_path = sys.argv[2]
bucketName = sys.argv[3]

# OSS bucket
bucket = "oss://speechiotest-{test_time}/".format(test_time=test_time)
bucket = "oss://{bucketName}".format(bucketName=bucketName)
for meta in wavFile:
meta = meta.split()
wavId = meta[0]
Expand All @@ -40,17 +40,14 @@
# 创建bucket
md_command = "./ossutil64 mb {bucket} -e oss-cn-hangzhou.aliyuncs.com -i {access_key_id} -k {access_key_secret}".format(bucket=bucket, access_key_id=access_key_id, access_key_secret=access_key_secret)
md_res = os.popen(md_command)
for r in md_res:
print(r)

time.sleep(5)
ossPath = bucket + "wav/"
ossWavFile = codecs.open(sys.argv[2], 'w+', 'utf8')
for idx, wav in wavDict.items():
ossPath = bucket + "/wav/"
for wavIdx, wav in wavDict.items():
# 上传文件
upload_command = "./ossutil64 cp {wav} {osspath} -e oss-cn-hangzhou.aliyuncs.com -i {access_key_id} -k {access_key_secret}".format(wav=wav, osspath=ossPath, access_key_id=access_key_id, access_key_secret=access_key_secret)
upload_res = os.popen(upload_command)
# 生成签名URL
sign_command = "./ossutil64 sign {osspath}{idx}.wav --timeout {times} -e oss-cn-hangzhou.aliyuncs.com -i {access_key_id} -k {access_key_secret}".format(osspath=ossPath, idx=idx, times=32400, access_key_id=access_key_id, access_key_secret=access_key_secret)
res = os.popen(sign_command)
for audio in res:
ossWavFile.write(idx + "\t" + audio)
break
for r in upload_res:
print(wavIdx + "\t" + r.strip())

0 comments on commit 1dcf7c1

Please sign in to comment.