Add voice recognition

2025-02-20 17:46:51 +08:00 · 2017-01-01 23:16:34 +08:00 · 2017-01-01 23:16:34 +08:00 · 1880409b7f
commit 1880409b7f
parent 6e86d36056
11 changed files with 156 additions and 15 deletions
--- a/4
+++ b/4
@ -9,4 +9,8 @@ COPY requirements.txt requirements.txt
 RUN pip install --upgrade pip
 RUN pip install -r requirements.txt

+RUN apt-get update \
+    && apt-get install -y ffmpeg \
+    && rm -rf /var/lib/apt/lists/*
+
 CMD python app.py
--- a/commands/natural_language.py
+++ b/commands/natural_language.py
@ -0,0 +1,12 @@
+import jieba
+
+from command import CommandRegistry
+
+__registry__ = cr = CommandRegistry()
+
+
+@cr.register('process')
+@cr.restrict(full_command_only=True)
+def process(args_text, ctx_msg, internal=False):
+    """
+    Handle a natural-language message.
+
+    For now this only prints the raw text and the result of
+    jieba.cut_for_search() on it to stdout; nothing is returned.
+
+    :param args_text: message text to process
+    :param ctx_msg: context message (not used here)
+    :param internal: not used here
+    """
+    print('自然语言消息处理', args_text)
+    words = jieba.cut_for_search(args_text)
+    print(list(words))
--- a/commands/translate.py
+++ b/commands/translate.py
@ -97,6 +97,6 @@ def translate_to(args_text, ctx_msg):
        data = resp.json()
        print(data)
        if 'trans_result' in data:
-            core.echo('翻译结果：\n' + '\n'.join([x['dst'] for x in data['trans_result']]), ctx_msg)
+            core.echo('翻译结果（百度翻译）：\n' + '\n'.join([x['dst'] for x in data['trans_result']]), ctx_msg)
            return
    core.echo('翻译失败，可能因为后台接口的频率限制或服务器连接不上', ctx_msg)
--- a/config.py
+++ b/config.py
@ -1,5 +1,5 @@
 config = {
-    'fallback_command': 'core.chat',
+    'fallback_command': 'natural_language.process',
    'command_start_flags': ('/', '／', '来，', '来,'),
    'command_name_separators': ('\.', '->', '::', '/'),  # Regex
    'command_args_start_flags': ('，', '：', ',', ', ', ':', ': '),  # Regex
--- a/filters/command_dispatcher_0.py
+++ b/filters/command_dispatcher_0.py
@ -30,30 +30,33 @@ def _load_commands():


 def _dispatch_command(ctx_msg):
+    # noinspection PyBroadException
    try:
-        content = ctx_msg.get('content', '').lstrip()
+        text = ctx_msg.get('text', '').lstrip()
+        if not text:
+            raise SkipException
        source = get_source(ctx_msg)
        start_flag = None
        for flag in _command_start_flags:
            # Match the command start flag
-            if content.startswith(flag):
+            if text.startswith(flag):
                start_flag = flag
                break
-        if not start_flag or len(content) <= len(start_flag):
+        if not start_flag or len(text) <= len(start_flag):
            # No command, check if a session exists
            if interactive.has_session(source):
-                command = [interactive.get_session(source).cmd, content]
+                command = [interactive.get_session(source).cmd, text]
            else:
                # Use fallback
                if _fallback_command:
-                    command = [_fallback_command, content]
+                    command = [_fallback_command, text]
                else:
                    # No fallback
                    raise SkipException
        else:
            # Split command and arguments
            command = re.split('|'.join(_command_args_start_flags),
-                               content[len(start_flag):], 1)
+                               text[len(start_flag):], 1)
            if len(command) == 1:
                # Add an empty argument
                command.append('')
--- a/filters/how_to_use_1.py
+++ b/filters/how_to_use_1.py
@ -5,8 +5,9 @@ from commands import core
 def _print_help_message(ctx_msg):
+    """
+    Answer with the help message when the message text is a known help phrase.
+
+    The 'text' field (empty string when absent) is stripped and compared
+    against a fixed list of phrases. On a match core.help() is called and
+    False is returned; otherwise True is returned.
+    NOTE(review): False presumably stops the remaining filters -- confirm
+    against the filter module.
+    """
    a = ['help', '怎么用', '怎么用啊', '你好', '你好啊', '帮助',
         '用法', '使用帮助', '使用指南', '使用说明', '使用方法',
-         '你能做什么', '你能做些什么', '你会做什么', '你会做些什么']
-    if ctx_msg.get('content', '').strip() in a:
+         '你能做什么', '你能做些什么', '你会做什么', '你会做些什么',
+         '你可以做什么', '你可以做些什么']
+    if ctx_msg.get('text', '').strip() in a:
        core.help('', ctx_msg)
        return False
    return True
--- a/filters/intercept_some_message_formats_100.py
+++ b/filters/intercept_some_message_formats_100.py
@ -0,0 +1,22 @@
+"""
+This filter intercepts messages that contain disallowed content and copies text content to the 'text' field.
+"""
+
+from filter import add_filter
+
+
+def _filter(ctx_msg):
+    """
+    Reject message formats that are not allowed and fill in the 'text' field.
+
+    - 'qq' messages: 'content' is copied to 'text' and the message passes.
+    - 'wx' text messages: 'content' is copied to 'text' and the message passes.
+    - 'wx' non-text messages: pass (without a 'text' field) only when the
+      format is 'media' and the type is 'friend_message'.
+    - Any other 'via' value: the message passes untouched.
+
+    :return: True if the message passes this filter, False otherwise
+    """
+    channel = ctx_msg.get('via')
+    if channel == 'qq':
+        ctx_msg['text'] = ctx_msg.get('content')
+        return True
+    if channel != 'wx':
+        return True
+
+    fmt = ctx_msg.get('format')
+    if fmt == 'text':
+        ctx_msg['text'] = ctx_msg.get('content')
+        return True
+    # Non-text WeChat message: only media sent as a friend message is allowed.
+    return fmt == 'media' and ctx_msg.get('type') == 'friend_message'
+
+
+add_filter(_filter, 100)
--- a/filters/message_logger_1000.py
+++ b/filters/message_logger_1000.py
@ -1,3 +1,7 @@
+"""
+This filter just logs messages to stdout.
+"""
+
 from filter import add_filter


--- a/filters/speech_recognition_90.py
+++ b/filters/speech_recognition_90.py
@ -0,0 +1,88 @@
+"""
+This filter recognizes speech in voice messages and stores the result in the 'text' field of the context message.
+"""
+
+import re
+import os
+import base64
+
+import requests
+from pydub import AudioSegment
+import speech_recognition as sr
+
+from filter import add_filter
+from commands import core
+
+
+def _recognize_baidu(wav_path, unique_id, api_key, secret_key, language='zh', timeout=30):
+    """
+    Recognize speech in a WAV file through the Baidu speech HTTP API.
+
+    An access token is requested with the given keys first, then the
+    base64-encoded audio is posted to the recognition endpoint.
+
+    :param wav_path: path of the WAV file to recognize
+    :param unique_id: value sent as the 'cuid' field of the request
+    :param api_key: sent as 'client_id' when requesting the access token
+    :param secret_key: sent as 'client_secret' when requesting the access token
+    :param language: value sent as the 'lan' field of the request
+    :param timeout: timeout in seconds applied to each HTTP request
+    :return: the recognized text with leading/trailing '，。？！' removed,
+             or None if authentication or recognition failed
+    """
+    api_url = 'http://vop.baidu.com/server_api'
+    auth_url = 'https://openapi.baidu.com/oauth/2.0/token'
+
+    try:
+        # Let requests build the query string so that the keys are URL-encoded.
+        resp = requests.get(auth_url, params={
+            'grant_type': 'client_credentials',
+            'client_id': api_key,
+            'client_secret': secret_key,
+        }, timeout=timeout)
+        if resp.status_code != 200:
+            return None
+        data = resp.json()
+    except (requests.RequestException, ValueError):
+        # Network failure or non-JSON body: report "not recognized",
+        # the same way _recognize_bing does.
+        return None
+    if not data or 'access_token' not in data:
+        return None
+
+    with open(wav_path, 'rb') as f:
+        audio_data = f.read()
+    payload = {
+        'format': 'wav',
+        'rate': 8000,
+        'channel': 1,
+        'cuid': unique_id,
+        'token': data['access_token'],
+        'lan': language,
+        'speech': base64.b64encode(audio_data).decode('utf-8'),
+        # Length of the raw audio, not of the base64 text.
+        'len': len(audio_data),
+    }
+
+    try:
+        resp = requests.post(api_url, json=payload, timeout=timeout)
+        if resp.status_code != 200:
+            return None
+        data = resp.json()
+    except (requests.RequestException, ValueError):
+        return None
+    if data and 'result' in data:
+        return ''.join(data['result']).strip('，。？！')
+    return None
+
+
+def _recognize_bing(wav_path, api_key, language='zh-CN'):
+    """
+    Recognize speech in a WAV file with the Bing recognizer of the
+    speech_recognition package.
+
+    :param wav_path: path of the WAV file to recognize
+    :param api_key: key passed to recognize_bing()
+    :param language: language passed to recognize_bing()
+    :return: the recognized text, or None when sr.UnknownValueError or
+             sr.RequestError is raised
+    """
+    recognizer = sr.Recognizer()
+    with sr.AudioFile(wav_path) as audio_file:
+        recorded = recognizer.record(audio_file)
+    try:
+        return recognizer.recognize_bing(recorded, key=api_key, language=language)
+    except (sr.UnknownValueError, sr.RequestError):
+        return None
+
+
+def _filter(ctx_msg):
+    """
+    Recognize the speech of a WeChat voice message and store it in 'text'.
+
+    Only messages with via == 'wx', format == 'media' and
+    media_type == 'voice' whose content looks like '[语音](path.mp3)' are
+    handled. The mp3 file is converted to a temporary wav file, sent to
+    Baidu speech recognition, and the result (if any) is written to
+    ctx_msg['text']. The user is told about the progress and the outcome
+    through core.echo().
+
+    :return: always True, spelled out explicitly like the other filters in
+             this commit do, so that the message keeps being processed
+    """
+    is_wx_voice = (ctx_msg.get('via') == 'wx'
+                   and ctx_msg.get('format') == 'media'
+                   and ctx_msg.get('media_type') == 'voice')
+    if not is_wx_voice:
+        return True
+
+    # Raw string avoids invalid escape sequences; 'or' guards a missing content.
+    m = re.match(r'\[语音\]\(([/_A-Za-z0-9]+\.mp3)\)', ctx_msg.get('content') or '')
+    if not m:
+        return True
+
+    core.echo('正在识别语音内容，请稍等……', ctx_msg)
+    mp3_path = m.group(1)
+    wav_path = os.path.splitext(mp3_path)[0] + '.wav'
+    try:
+        AudioSegment.from_mp3(mp3_path).export(wav_path, format='wav')
+        # _recognize_bing(wav_path, os.environ.get('BING_SPEECH_API_KEY'),
+        # language='zh-CN') can be used here as an alternative backend.
+        text = _recognize_baidu(
+            wav_path,
+            (ctx_msg.get('sender_id') or '')[-60:],  # only the last 60 characters are sent
+            os.environ.get('BAIDU_SPEECH_API_KEY'),
+            os.environ.get('BAIDU_SPEECH_SECRET_KEY'),
+            language='zh'
+        )
+    finally:
+        # Remove the temporary wav file even if conversion or recognition raised.
+        if os.path.exists(wav_path):
+            os.remove(wav_path)
+
+    if text:
+        reply = '识别结果（百度语音识别）：\n%s\n\n下面将把识别到的内容作为文字消息处理……' % text
+        ctx_msg['text'] = text
+    else:
+        reply = '抱歉哦，没有识别出你说的是什么'
+    core.echo(reply, ctx_msg)
+    return True
+
+
+add_filter(_filter, 90)
--- a/filters/split_at_xiaokai_50.py
+++ b/filters/split_at_xiaokai_50.py
@ -1,21 +1,25 @@
+"""
+This filter intercepts messages not intended for the bot and removes the leading "@xxx".
+"""
+
 from filter import add_filter


 def _split_at_xiaokai(ctx_msg):
+    """
+    Drop group/discuss messages that do not mention the bot and strip the mention.
+
+    For 'group_message' and 'discuss_message' types the 'text' field must
+    start with '@' followed by the value of the 'receiver' field; otherwise
+    False is returned. On a match the mention is removed and the
+    left-stripped remainder is written back to 'text'. Messages of any other
+    type are passed through unchanged.
+
+    :return: False when the message should be intercepted, True otherwise
+    """
    if ctx_msg.get('type') == 'group_message' or ctx_msg.get('type') == 'discuss_message':
-        content = ctx_msg.get('content', '')
-        if content.startswith('@'):
+        text = ctx_msg.get('text', '')
+        if text.startswith('@'):
+            # 'receiver' is used as the bot's own nickname in the group
            my_group_nick = ctx_msg.get('receiver')
            if not my_group_nick:
                return False
            at_me = '@' + my_group_nick
-            if not content.startswith(at_me):
+            if not text.startswith(at_me):
                return False
-            content = content[len(at_me):]
+            text = text[len(at_me):]
        else:
            # Not starts with '@'
            return False
-        ctx_msg['content'] = content.lstrip()
+        ctx_msg['text'] = text.lstrip()
    return True


--- a/requirements.txt
+++ b/requirements.txt
@ -5,3 +5,6 @@ cachetools
 pytz
 flask
 sqlalchemy
+pydub
+SpeechRecognition
+jieba