优化网页内容获取功能,添加摘要生成支持,重构相关函数

This commit is contained in:
远野千束(神羽) 2024-12-17 13:51:18 +08:00
parent 4d5af4bc00
commit 777e577a17
4 changed files with 59 additions and 18 deletions

View File

@ -350,10 +350,12 @@ async def marsho(
tool_call.function.arguments.replace("'", '"')
)
logger.info(
f"调用函数 {tool_call.function.name} ,参数为 {function_args}"
f"调用函数 {tool_call.function.name.replace("-", ".")}\n参数:"
+ "\n".join([f"{k}={v}" for k, v in function_args.items()])
)
await UniMessage(
f"调用函数 {tool_call.function.name} ,参数为 {function_args}"
f"调用函数 {tool_call.function.name.replace("-", ".")}\n参数:"
+ "\n".join([f"{k}={v}" for k, v in function_args.items()])
).send()
# TODO 临时追加插件函数,若工具中没有则调用插件函数
if tools.has_function(tool_call.function.name):

View File

@ -1,12 +1,12 @@
import time
from httpx import AsyncClient
from newspaper import Article
from newspaper import Article # type: ignore
from nonebot import logger
from nonebot_plugin_marshoai.plugin.func_call.caller import on_function_call
from nonebot_plugin_marshoai.plugin.func_call.params import String
from .utils import make_html_summary
# Default HTTP headers used for all page fetches in this module.
# NOTE(review): this User-Agent looks malformed — a genuine Firefox UA begins
# with "Mozilla/5.0 (...)", not "Firefox/90.0 (...)"; some sites may reject or
# mis-classify this client. Confirm whether the string is intentional.
headers = {
    "User-Agent": "Firefox/90.0 (Windows NT 10.0; Win64; x64; rv:90.0) Gecko/20100101 Firefox/90.0"
}
@ -16,9 +16,9 @@ headers = {
description="使用网页链接(url)获取网页内容摘要,可以让AI上网查询资料"
).params(
url=String(description="网页链接"),
typ=String(description="获取类型,摘要还是内容", enum=["摘要", "内容"]),
typ=String(description="获取类型,摘要还是内容"),
)
async def get_web_content(url: str, typ: str) -> str:
async def get_web_content(url: str) -> str:
"""使用网页链接获取网页内容摘要
为什么要获取摘要不然token超限了
@ -31,16 +31,19 @@ async def get_web_content(url: str, typ: str) -> str:
async with AsyncClient(headers=headers) as client:
try:
response = await client.get(url)
t1 = time.time()
if response.status_code == 200:
article = Article(url)
article.set_html(response.text)
article.download(input_html=response.text)
article.parse()
t2 = time.time()
logger.debug(f"获取网页内容耗时: {t2 - t1}")
if typ == "摘要":
return f"标题: {article.title}\n作者: {article.authors}\n发布日期: {article.publish_date}"
elif typ == "内容":
return f"标题: {article.title}\n作者: {article.authors}\n发布日期: {article.publish_date}\n摘要: {article.summary}\n正文: {article.text}"
if article.text:
return article.text
elif article.html:
return await make_html_summary(article.html)
else:
return "未能获取到有效的网页内容"
else:
return "获取网页内容失败" + str(response.status_code)
except Exception as e:
logger.error(f"marsho builtin: 获取网页内容失败: {e}")
return "获取网页内容失败:" + str(e)

View File

@ -0,0 +1,35 @@
import asyncio
from concurrent.futures import ThreadPoolExecutor
# NOTE(review): `Article` is not referenced anywhere in this module's visible
# code — confirm it is needed before removing the import.
from newspaper import Article  # type: ignore
from sumy.nlp.tokenizers import Tokenizer  # type: ignore
from sumy.parsers.plaintext import PlaintextParser  # type: ignore
from sumy.summarizers.lsa import LsaSummarizer  # type: ignore
# Module-wide worker pool: sumy summarization is synchronous and CPU-bound,
# so it is dispatched here (via run_in_executor) to keep the event loop free.
executor = ThreadPoolExecutor()
async def make_html_summary(
    html_content: str, language: str = "english", length: int = 3
) -> str:
    """Generate a plain-text summary of the given HTML content.

    The synchronous, CPU-bound summarization step is off-loaded to the
    module-level thread pool so the event loop is not blocked.

    Args:
        html_content (str): HTML content to summarize.
        language (str, optional): Tokenizer language. Defaults to "english".
        length (int, optional): Number of sentences in the summary. Defaults to 3.

    Returns:
        str: The generated summary.
    """
    # Fix: asyncio.get_event_loop() is deprecated inside a running coroutine
    # (Python 3.10+); get_running_loop() is the correct, unambiguous call here.
    loop = asyncio.get_running_loop()
    return await loop.run_in_executor(
        executor, _make_summary, html_content, language, length
    )
def _make_summary(html_content: str, language: str, length: int) -> str:
    """Blocking helper: reduce *html_content* to a *length*-sentence summary.

    Runs entirely synchronously; callers should dispatch it to a worker
    thread (see ``make_html_summary``).
    """
    document = PlaintextParser.from_string(
        html_content, Tokenizer(language)
    ).document
    sentences = LsaSummarizer()(document, length)
    return " ".join(str(sentence) for sentence in sentences)

View File

@ -24,7 +24,8 @@ dependencies = [
"litedoc>=0.1.0.dev20241214103915",
"newspaper3k>=0.2.8",
"lxml[html_clean]>=5.3.0",
"aiofiles>=24.1.0"
"aiofiles>=24.1.0",
"sumy>=0.11.0"
]
license = { text = "MIT, Mulan PSL v2" }