优化网页内容获取功能,添加摘要生成支持,重构相关函数

This commit is contained in:
远野千束(神羽) 2024-12-17 13:51:18 +08:00
parent 4d5af4bc00
commit 777e577a17
4 changed files with 59 additions and 18 deletions

View File

@ -350,10 +350,12 @@ async def marsho(
tool_call.function.arguments.replace("'", '"')
)
logger.info(
f"调用函数 {tool_call.function.name} ,参数为 {function_args}"
f"调用函数 {tool_call.function.name.replace("-", ".")}\n参数:"
+ "\n".join([f"{k}={v}" for k, v in function_args.items()])
)
await UniMessage(
f"调用函数 {tool_call.function.name} ,参数为 {function_args}"
f"调用函数 {tool_call.function.name.replace("-", ".")}\n参数:"
+ "\n".join([f"{k}={v}" for k, v in function_args.items()])
).send()
# TODO 临时追加插件函数,若工具中没有则调用插件函数
if tools.has_function(tool_call.function.name):

View File

@ -1,12 +1,12 @@
import time
from httpx import AsyncClient
from newspaper import Article
from newspaper import Article # type: ignore
from nonebot import logger
from nonebot_plugin_marshoai.plugin.func_call.caller import on_function_call
from nonebot_plugin_marshoai.plugin.func_call.params import String
from .utils import make_html_summary
# Default HTTP headers used for all page fetches in this module.
# NOTE(review): this User-Agent looks malformed — a genuine Firefox UA begins
# with "Mozilla/5.0 (...)", not "Firefox/90.0 (...)"; some sites may reject or
# mis-classify this client. Confirm whether the string is intentional.
headers = {
    "User-Agent": "Firefox/90.0 (Windows NT 10.0; Win64; x64; rv:90.0) Gecko/20100101 Firefox/90.0"
}
@ -16,9 +16,9 @@ headers = {
description="使用网页链接(url)获取网页内容摘要,可以让AI上网查询资料"
).params(
url=String(description="网页链接"),
typ=String(description="获取类型,摘要还是内容", enum=["摘要", "内容"]),
typ=String(description="获取类型,摘要还是内容"),
)
async def get_web_content(url: str, typ: str) -> str:
async def get_web_content(url: str) -> str:
"""使用网页链接获取网页内容摘要
为什么要获取摘要不然token超限了
@ -31,16 +31,19 @@ async def get_web_content(url: str, typ: str) -> str:
async with AsyncClient(headers=headers) as client:
try:
response = await client.get(url)
t1 = time.time()
if response.status_code == 200:
article = Article(url)
article.set_html(response.text)
article.download(input_html=response.text)
article.parse()
t2 = time.time()
logger.debug(f"获取网页内容耗时: {t2 - t1}")
if typ == "摘要":
return f"标题: {article.title}\n作者: {article.authors}\n发布日期: {article.publish_date}"
elif typ == "内容":
return f"标题: {article.title}\n作者: {article.authors}\n发布日期: {article.publish_date}\n摘要: {article.summary}\n正文: {article.text}"
if article.text:
return article.text
elif article.html:
return await make_html_summary(article.html)
else:
return "未能获取到有效的网页内容"
else:
return "获取网页内容失败" + str(response.status_code)
except Exception as e:
logger.error(f"marsho builtin: 获取网页内容失败: {e}")
return "获取网页内容失败:" + str(e)

View File

@ -0,0 +1,35 @@
import asyncio
from concurrent.futures import ThreadPoolExecutor
# NOTE(review): `Article` is not referenced anywhere in this module's visible
# code — confirm it is needed before removing the import.
from newspaper import Article  # type: ignore
from sumy.nlp.tokenizers import Tokenizer  # type: ignore
from sumy.parsers.plaintext import PlaintextParser  # type: ignore
from sumy.summarizers.lsa import LsaSummarizer  # type: ignore
# Module-wide worker pool: sumy summarization is synchronous and CPU-bound,
# so it is dispatched here (via run_in_executor) to keep the event loop free.
executor = ThreadPoolExecutor()
async def make_html_summary(
    html_content: str, language: str = "english", length: int = 3
) -> str:
    """Generate a plain-text summary of the given HTML content.

    The synchronous, CPU-bound summarization step is off-loaded to the
    module-level thread pool so the event loop is not blocked.

    Args:
        html_content (str): HTML content to summarize.
        language (str, optional): Tokenizer language. Defaults to "english".
        length (int, optional): Number of sentences in the summary. Defaults to 3.

    Returns:
        str: The generated summary.
    """
    # Fix: asyncio.get_event_loop() is deprecated inside a running coroutine
    # (Python 3.10+); get_running_loop() is the correct, unambiguous call here.
    loop = asyncio.get_running_loop()
    return await loop.run_in_executor(
        executor, _make_summary, html_content, language, length
    )
def _make_summary(html_content: str, language: str, length: int) -> str:
    """Blocking helper: reduce *html_content* to a *length*-sentence summary.

    Runs entirely synchronously; callers should dispatch it to a worker
    thread (see ``make_html_summary``).
    """
    document = PlaintextParser.from_string(
        html_content, Tokenizer(language)
    ).document
    sentences = LsaSummarizer()(document, length)
    return " ".join(str(sentence) for sentence in sentences)

View File

@ -24,7 +24,8 @@ dependencies = [
"litedoc>=0.1.0.dev20241214103915",
"newspaper3k>=0.2.8",
"lxml[html_clean]>=5.3.0",
"aiofiles>=24.1.0"
"aiofiles>=24.1.0",
"sumy>=0.11.0"
]
license = { text = "MIT, Mulan PSL v2" }