import re

from bs4 import BeautifulSoup
from markdownify import MarkdownConverter


def stripMarkdown(text):
    """Return *text* with common inline Markdown markup removed.

    Header markers, ``*``/``_`` emphasis, fenced and inline code delimiters
    and link targets are stripped with regular expressions; the visible
    text of each construct is kept. The result is stripped of leading and
    trailing whitespace.

    This is a best-effort cleaner, not a Markdown parser: backslash escapes
    and nested constructs are not interpreted.
    """
    # Remove header markers. Only a run of '#' that opens a line is a header;
    # a '#' inside running text (e.g. "C# is ...") must be left alone.
    text = re.sub(r'^([ \t]*)#+(\s+)', r'\1\2', text, flags=re.MULTILINE)
    # Remove emphasis (double markers first so '**x**' is not half-matched)
    text = re.sub(r'\*\*(.*?)\*\*', r'\1', text)
    text = re.sub(r'\*(.*?)\*', r'\1', text)
    text = re.sub(r'__(.*?)__', r'\1', text)
    text = re.sub(r'_(.*?)_', r'\1', text)
    # Remove code blocks
    text = re.sub(r'```(.*?)```', r' \1 ', text, flags=re.DOTALL)
    # Remove inline code
    text = re.sub(r'`(.*?)`', r' \1 ', text)
    # Remove links, keeping the link text
    text = re.sub(r'\[(.*?)\]\(.*?\)', r'\1', text)
    return text.strip()


# Sentence-ending punctuation: the CJK full stop, the full-width exclamation
# and question marks (written as escapes so they cannot be confused with the
# ASCII forms), and the ASCII '.', '!' and '?'.
stopList = ["。", "\uff01", "\uff1f", ".", "!", "?"]

# One or more consecutive stop characters, captured so that re.split keeps
# the punctuation in its result.
_stopRunPattern = re.compile("([" + re.escape("".join(stopList)) + "]+)")


def _enterHeading(titlePath, depth, title):
    """Return the heading path after a heading of level *depth* is seen.

    Index ``d - 1`` of the path holds the title of the current level-``d``
    heading; levels that were skipped are filled with ``None``.
    """
    path = titlePath[:depth - 1]
    # Pad skipped levels (a negative count yields an empty list).
    path.extend([None] * (depth - 1 - len(path)))
    path.append(title)
    return path


def _chunkSentences(line, maxLength):
    """Split *line* into sentences and pack them into chunks.

    Each sentence keeps its trailing run of stop characters. Sentences are
    concatenated until adding the next one would push the chunk past
    *maxLength* characters; a single sentence longer than *maxLength*
    becomes a chunk of its own. Text after the last stop is kept.
    """
    # With a capturing group, re.split alternates text and separators and
    # always returns an odd number of items: text, stops, ..., text.
    pieces = _stopRunPattern.split(line)
    bodies = pieces[0::2]
    stops = pieces[1::2] + [""]  # the final text has no stop after it

    chunks = []
    current = []
    currentLen = 0
    for body, stop in zip(bodies, stops):
        sentence = body + stop
        if not sentence:
            continue
        # Flush only a non-empty chunk, so an over-long first sentence does
        # not produce an empty chunk ahead of it.
        if current and currentLen + len(sentence) > maxLength:
            chunks.append("".join(current))
            current = []
            currentLen = 0
        current.append(sentence)
        currentLen += len(sentence)
    if current:
        chunks.append("".join(current))
    return chunks


def getWikiSentences(html, line_max_length=150):
    """Convert rendered wiki HTML into a list of titled text chunks.

    The element with class ``mw-parser-output`` (or the whole document when
    no such element exists) is converted to Markdown. Heading lines update
    the current section path, table rows are skipped, and every other line
    is split into sentence chunks of at most about *line_max_length*
    characters.

    Args:
        html: HTML source of the page. All CR and LF characters are removed
            before parsing.
        line_max_length: Soft upper bound on the length of one chunk (not
            counting the section prefix). A single sentence longer than
            this is emitted unsplit.

    Returns:
        A list of dicts, in document order, each with the keys ``"text"``
        (section path + ": " + chunk with Markdown stripped),
        ``"markdown"`` (section path + ": " + raw Markdown chunk),
        ``"text_len"`` and ``"markdown_len"`` (their lengths).
    """
    # Remove all new-lines
    html = html.replace("\r", "").replace("\n", "")
    soup = BeautifulSoup(html, 'html.parser')
    parserOutput = soup.find(class_="mw-parser-output")
    if parserOutput is None:
        # No MediaWiki wrapper (e.g. an HTML fragment): use everything.
        parserOutput = soup

    # Remove <style>
    for style in parserOutput.find_all("style"):
        style.decompose()
    # Remove <caption> if empty. get_text() is used because .string is None
    # for any caption with more than one child, even when it has text.
    for caption in parserOutput.find_all("caption"):
        if not caption.get_text(strip=True):
            caption.decompose()

    mdContent = MarkdownConverter(
        heading_style="ATX", newline_style="BACKSLASH"
    ).convert_soup(parserOutput)

    titlePath = []
    docList = []
    for line in mdContent.split("\n"):
        line = line.strip()
        if not line:
            continue

        if line.startswith("#"):
            # Title: the number of leading '#' is the heading level.
            depth = len(line) - len(line.lstrip("#"))
            titlePath = _enterHeading(titlePath, depth, line[depth:].strip())
            continue

        if line.startswith("|") and line.endswith("|"):
            # Table row
            continue

        if line.startswith(">"):
            # Quote: drop the marker, keep the quoted text
            line = line[1:].strip()

        # Section prefix; text before the first heading uses the default
        # label (Chinese for "introduction").
        titlePathStr = "/".join([title for title in titlePath if title])
        if titlePathStr == "":
            titlePathStr = "简介"

        # Combine stops with the preceding sentence, and combine sentences
        # until line_max_length is reached.
        for chunk in _chunkSentences(line, line_max_length):
            strippedText = titlePathStr + ": " + stripMarkdown(chunk)
            markdownText = titlePathStr + ": " + chunk
            docList.append({
                "text": strippedText,
                "markdown": markdownText,
                "text_len": len(strippedText),
                "markdown_len": len(markdownText),
            })
    return docList