import asyncio
from concurrent.futures import ThreadPoolExecutor

from newspaper import Article  # type: ignore
from sumy.nlp.tokenizers import Tokenizer  # type: ignore
from sumy.parsers.plaintext import PlaintextParser  # type: ignore
from sumy.summarizers.lsa import LsaSummarizer  # type: ignore

# Shared thread pool used to run the blocking summarization work off the event loop.
executor = ThreadPoolExecutor()


async def make_html_summary(
    html_content: str, language: str = "english", length: int = 3
) -> str:
    """Generate a summary from HTML content.

    Args:
        html_content (str): The HTML content to summarize.
        language (str, optional): Language of the content. Defaults to "english".
        length (int, optional): Number of sentences in the summary. Defaults to 3.

    Returns:
        str: The generated summary.
    """
    # Run the blocking summarization in the shared thread pool so the event loop stays responsive.
    loop = asyncio.get_running_loop()
    return await loop.run_in_executor(
        executor, _make_summary, html_content, language, length
    )


def _make_summary(html_content: str, language: str, length: int) -> str:
    # Parse the text, build an LSA summary of `length` sentences, and join them into one string.
    parser = PlaintextParser.from_string(html_content, Tokenizer(language))
    summarizer = LsaSummarizer()
    summary = summarizer(parser.document, length)
    return " ".join([str(sentence) for sentence in summary])
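

# A minimal usage sketch, not part of the original module: the sample text below is
# made up for illustration, and it assumes the NLTK tokenizer data that sumy's
# Tokenizer relies on is already available in the environment.
if __name__ == "__main__":
    sample = (
        "Python is a widely used programming language. "
        "It emphasizes readability and ships with a large standard library. "
        "Many teams use it for scripting, data processing, and web services."
    )
    print(asyncio.run(make_html_summary(sample, language="english", length=2)))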