From 882810378d6c3f30e35ade881cef112653889d8d Mon Sep 17 00:00:00 2001
From: Richard Chien
Date: Mon, 2 Jan 2017 23:51:19 +0800
Subject: [PATCH] Try to add the first natural language processor - 'translate'

---
Review notes (text in this region is not part of the commit message or the diff):
- process() now drops None results returned by processors before ranking
  them; previously a keyword hit without a pattern match raised TypeError.
- Regex literals containing \w are now raw strings (same runtime value).
- Docstrings/comments were added to the new functions; hunk headers and the
  diffstat were recomputed accordingly.
- Line structure and indentation (including context lines) were reconstructed
  from a whitespace-collapsed copy of this patch. If a context line does not
  match the target tree, apply with `git apply --ignore-whitespace`.
- The `index` blob hashes below predate these edits.

 commands/natural_language.py     | 49 ++++++++++++++++++++++++++++++++----
 commands/translate.py            | 24 ++++++++++++++++--
 filters/speech_recognition_90.py |  1 +
 little_shit.py                   |  4 +++
 nl_processor.py                  | 45 +++++++++++++++++++++++++++++++++++++
 nl_processors/translate.py       | 38 +++++++++++++++++++++++++++++++
 6 files changed, 154 insertions(+), 7 deletions(-)
 create mode 100644 nl_processor.py
 create mode 100644 nl_processors/translate.py

diff --git a/commands/natural_language.py b/commands/natural_language.py
index b9d2b0ce..cb4a4800 100644
--- a/commands/natural_language.py
+++ b/commands/natural_language.py
@@ -1,12 +1,51 @@
-import jieba
+import os
+import importlib
 
 from command import CommandRegistry
+from commands import core
+from nl_processor import parse_potential_commands
+from little_shit import get_nl_processors_dir
+from command import hub as cmdhub
 
-__registry__ = cr = CommandRegistry()
+
+def _init():
+    """Registry init hook: import all natural language processor modules."""
+    _load_processors()
+
+
+__registry__ = cr = CommandRegistry(init_func=_init)
 
 
 @cr.register('process')
 @cr.restrict(full_command_only=True)
-def process(args_text, ctx_msg, internal=False):
-    print('自然语言消息处理', args_text)
-    print(list(jieba.cut_for_search(args_text)))
+def process(sentence, ctx_msg, internal=False):
+    """Dispatch a natural language sentence to the best matching command.
+
+    Candidates are ``(score, command, args_text, parsed_data)`` tuples; the
+    highest one scoring above 60 is called, otherwise a fallback is echoed.
+    """
+    sentence = sentence.strip()
+    potential_commands = parse_potential_commands(sentence)
+    # A processor returns None when it cannot parse the sentence; drop those
+    # results, otherwise indexing them while ranking raises TypeError.
+    potential_commands = sorted(
+        (cmd for cmd in potential_commands if cmd is not None and cmd[0] > 60),
+        key=lambda cmd: cmd[0], reverse=True
+    )
+    if len(potential_commands) > 0:
+        most_possible_cmd = potential_commands[0]
+        ctx_msg['parsed_data'] = most_possible_cmd[3]
+        cmdhub.call(most_possible_cmd[1], most_possible_cmd[2], ctx_msg)
+    else:
+        core.echo('我暂时不理解你在说什么哦~', ctx_msg, internal)
+
+
+def _load_processors():
+    """Import every ``.py`` module in the nl_processors dir not starting with ``_``."""
+    processor_mod_files = filter(
+        lambda filename: filename.endswith('.py') and not filename.startswith('_'),
+        os.listdir(get_nl_processors_dir())
+    )
+    command_mods = [os.path.splitext(file)[0] for file in processor_mod_files]
+    for mod_name in command_mods:
+        importlib.import_module('nl_processors.' + mod_name)
diff --git a/commands/translate.py b/commands/translate.py
index f27fafe9..d9b82590 100644
--- a/commands/translate.py
+++ b/commands/translate.py
@@ -47,7 +47,27 @@ _lang_alias_map = {
     '汉语': 'zh',
     '英文': 'en',
     '日文': 'jp',
-    '韩文': 'kor'
+    '韩文': 'kor',
+    '法文': 'fra',
+    '西班牙文': 'spa',
+    '阿拉伯文': 'ara',
+    '俄文': 'ru',
+    '葡萄牙文': 'pt',
+    '德文': 'de',
+    '意大利文': 'it',
+    '希腊文': 'el',
+    '荷兰文': 'nl',
+    '波兰文': 'pl',
+    '保加利亚文': 'bul',
+    '爱沙尼亚文': 'est',
+    '丹麦文': 'dan',
+    '芬兰文': 'fin',
+    '捷克文': 'cs',
+    '罗马尼亚文': 'rom',
+    '斯洛文尼亚文': 'slo',
+    '瑞典文': 'swe',
+    '匈牙利文': 'hu',
+    '越南文': 'vie'
 }
 
 
@@ -67,7 +87,7 @@ def translate(args_text, ctx_msg):
     return translate_to('简体中文 ' + args_text, ctx_msg)
 
 
-@cr.register('translate_to', 'translate-to', '翻译到', '翻译成')
+@cr.register('translate_to', 'translate-to', '翻译到', '翻译成', '翻译为')
 def translate_to(args_text, ctx_msg):
     args = args_text.strip().split(' ', 1)
     if len(args) < 2 or (args[0] not in _lang_map and args[0] not in _lang_alias_map):
diff --git a/filters/speech_recognition_90.py b/filters/speech_recognition_90.py
index 9b45423c..be10f952 100644
--- a/filters/speech_recognition_90.py
+++ b/filters/speech_recognition_90.py
@@ -80,6 +80,7 @@ def _filter(ctx_msg):
             if text:
                 reply = '识别结果(百度语音识别):\n%s\n\n下面将把识别到的内容作为文字消息处理……' % text
                 ctx_msg['text'] = text
+                ctx_msg['from_voice'] = True
             else:
                 reply = '抱歉哦,没有识别出你说的是什么'
             core.echo(reply, ctx_msg)
diff --git a/little_shit.py b/little_shit.py
index d98dbea5..e874fbd8 100644
--- a/little_shit.py
+++ b/little_shit.py
@@ -26,6 +26,10 @@ def get_commands_dir():
     return _mkdir_if_not_exists_and_return_path(os.path.join(get_root_dir(), 'commands'))
 
 
+def get_nl_processors_dir():
+    return _mkdir_if_not_exists_and_return_path(os.path.join(get_root_dir(), 'nl_processors'))
+
+
 def get_db_dir():
     return _mkdir_if_not_exists_and_return_path(os.path.join(get_root_dir(), 'data', 'db'))
 
diff --git a/nl_processor.py b/nl_processor.py
new file mode 100644
index 00000000..d20073bf
--- /dev/null
+++ b/nl_processor.py
@@ -0,0 +1,45 @@
+import re
+
+import jieba.posseg
+
+_processors = []
+_processors_without_keyword = []
+
+
+def as_processor(keywords=None):
+    """Return a decorator registering a processor, optionally under keyword regexes."""
+    def decorator(func):
+        if keywords:
+            _processors.append((keywords, func))
+        else:
+            _processors_without_keyword.append(func)
+        return func
+
+    return decorator
+
+
+def parse_potential_commands(sentence):
+    """Segment the sentence, run the matching processors and collect their results.
+
+    A keyword processor runs at most once, when any of its keyword regexes
+    matches a segmented word; processors without keywords always run. Each
+    processor's return value is appended as-is, so the list may contain None.
+    """
+    segmentation = list(jieba.posseg.cut(sentence=sentence))
+    print('分词结果:', segmentation)
+    potential_commands = []
+    for processor in _processors:
+        processed = False
+        for regex in processor[0]:
+            for word, flag in segmentation:
+                if re.match(regex, word):
+                    potential_commands.append(processor[1](sentence, segmentation))
+                    processed = True
+                    # A word matched, skip the rest of words
+                    break
+            if processed:
+                # Current processor has processed, skip the rest of keywords
+                break
+    for func in _processors_without_keyword:
+        potential_commands.append(func(sentence, segmentation))
+    return potential_commands
diff --git a/nl_processors/translate.py b/nl_processors/translate.py
new file mode 100644
index 00000000..b3888ded
--- /dev/null
+++ b/nl_processors/translate.py
@@ -0,0 +1,38 @@
+import re
+
+from nl_processor import as_processor
+
+# Query first: group 1 is the text to translate, group 2 the language.
+_query_lang_matcher = [
+    re.compile(r'[把将]?[ ,.,。]?(.*?)[ ,.,。]?(?:这[个]?(?:词[组]?|句(?:子|话)?|短语))翻译[成为到](\w+?[文语])(?![ ::,,.。])'),
+    re.compile(r'(\w+?)[ ,.,。]?(?:这[个]?(?:词[组]?|句(?:子|话)?|短语))?[的用](\w+?[文语])')
+]
+
+# Language first: group 1 is the language, group 2 the text to translate.
+_lang_query_matcher = [
+    re.compile(r'[把将]?(?:(?:这[个]?|[下后][面]?)(?:词[组]?|句(?:子|话)?|短语))翻译[成为到](\w+?[文语])[ ::,,.。](.*)'),
+    re.compile(r'[用]?(\w+[文语])\w+?(?:说|讲|表达|表示)(.*)(?:这[个]?(?:词[组]?|句(?:子|话)?|短语))'),
+    re.compile(r'[用]?(\w+[文语])\w+?(?:说|讲|表达|表示)(.*)')
+]
+
+
+@as_processor(keywords=('翻译(为|成|到)?', '.+(文|语)'))
+def _processor(sentence, segmentation):
+    """Try to extract a translation request from the sentence.
+
+    Returns ``(90, 'translate.translate_to', args_text, None)`` when both the
+    target language and the text to translate are found, otherwise None.
+    """
+    lang = None
+    query = None
+    for matcher in _query_lang_matcher + _lang_query_matcher:
+        m = matcher.match(sentence)
+        if m:
+            if matcher in _lang_query_matcher:
+                lang, query = m.group(1), m.group(2)
+            else:
+                lang, query = m.group(2), m.group(1)
+            break
+    if lang and query:
+        return 90, 'translate.translate_to', ' '.join((lang.strip(), query.strip(' ,,'))), None
+    return None