# Ordered (pattern, replacement) rules applied by stripMarkdown. They are
# compiled once at import time instead of being re-specified on every call.
_MARKDOWN_RULES = (
    # ATX headers: only a run of 1-6 '#' at the start of a line (after at
    # most three blanks) is a header marker; '#' inside prose ("C#") is kept.
    (re.compile(r'^[ \t]{0,3}#{1,6}[ \t]+', re.MULTILINE), ''),
    # Asterisk emphasis: the delimiters must hug non-space text, so a
    # space-flanked '*' used as an operator ("2 * 3 * 4") is left alone.
    (re.compile(r'\*\*(?=\S)(.+?)(?<=\S)\*\*'), r'\1'),
    (re.compile(r'\*(?=\S)(.+?)(?<=\S)\*'), r'\1'),
    # Underscore emphasis: additionally must not touch a word character on
    # the outside, so identifiers such as snake_case_name survive.
    (re.compile(r'(?<!\w)__(?=\S)(.+?)(?<=\S)__(?!\w)'), r'\1'),
    (re.compile(r'(?<!\w)_(?=\S)(.+?)(?<=\S)_(?!\w)'), r'\1'),
    # Fenced code blocks may span lines; keep the content, padded by spaces.
    (re.compile(r'```(.*?)```', re.DOTALL), r' \1 '),
    # Inline code: keep the content, padded by spaces.
    (re.compile(r'`(.*?)`'), r' \1 '),
    # Links and images: keep the link text / alt text, drop the target.
    (re.compile(r'!?\[(.*?)\]\(.*?\)'), r'\1'),
)


def stripMarkdown(text):
    """Return *text* with common Markdown markup removed.

    The following constructs are reduced to their plain-text content, in
    this order: ATX headers (``# Title``), bold/italic emphasis written
    with ``*`` or ``_``, fenced code blocks, inline code spans, and
    links/images (``[text](url)`` / ``![alt](url)``).

    Args:
        text: Markdown source as a ``str``.

    Returns:
        The stripped text with leading and trailing whitespace removed.
        Code content is kept and surrounded by single spaces.

    Note:
        Code spans are processed after emphasis, so ``*`` pairs inside code
        can still be interpreted as emphasis. This is a regex-based
        approximation, not a full Markdown parser.
    """
    for pattern, replacement in _MARKDOWN_RULES:
        text = pattern.sub(replacement, text)
    return text.strip()