You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

120 lines
3.8 KiB
Python

import re
from bs4 import BeautifulSoup
from markdownify import MarkdownConverter
def stripMarkdown(text):
    """Return *text* with common inline Markdown syntax removed.

    Handles ATX header markers, ``*``/``_`` emphasis, fenced and inline code,
    and links/images (only the link text / alt text is kept). The result is
    stripped of leading and trailing whitespace.

    Args:
        text: Markdown-formatted string.

    Returns:
        The text with the markup listed above removed.
    """
    # Remove header markers. Anchored to the start of a line so that a '#'
    # inside running text (e.g. "C# is ...") is left untouched.
    text = re.sub(r'^([ \t]*)#+(\s+)', r'\1\2', text, flags=re.MULTILINE)
    # Remove emphasis: double markers first so '**x**' is not half-consumed
    # by the single-marker patterns.
    text = re.sub(r'\*\*(.*?)\*\*', r'\1', text)
    text = re.sub(r'\*(.*?)\*', r'\1', text)
    text = re.sub(r'__(.*?)__', r'\1', text)
    text = re.sub(r'_(.*?)_', r'\1', text)
    # Remove code fences (may span lines), keeping the enclosed code
    text = re.sub(r'```(.*?)```', r' \1 ', text, flags=re.DOTALL)
    # Remove inline code markers
    text = re.sub(r'`(.*?)`', r' \1 ', text)
    # Remove links and images, keeping the link text / alt text. The optional
    # '!' makes '![alt](src)' collapse to 'alt' instead of '!alt'.
    text = re.sub(r'!?\[(.*?)\]\(.*?\)', r'\1', text)
    return text.strip()
# List of sentence-terminating punctuation marks.
# The full-width entries are written as escapes so they survive re-encoding;
# empty-string entries would match at every position of any string.
stopList = [
    "\u3002",  # ideographic full stop
    "\uff01",  # full-width exclamation mark
    "\uff1f",  # full-width question mark
    ".",
    "!",
    "?",
]
# Sentence terminators: ideographic full stop, full-width '!' and '?', and
# their ASCII counterparts. Escapes are used so the pattern survives
# re-encoding of this file.
_SENTENCE_STOPS = "\u3002\uff01\uff1f.!?"
_STOP_RUN_RE = re.compile("([" + _SENTENCE_STOPS + "]+)")


def _updateTitlePath(titlePath, line):
    """Return the heading path after applying the ATX heading in *line*.

    The returned list has exactly ``depth`` entries: the title of a level-d
    heading lives at index ``d - 1`` and skipped levels hold ``None``.
    """
    depth = len(line) - len(line.lstrip("#"))
    title = line[depth:].strip()
    # Keep only ancestors (levels shallower than this heading) ...
    newPath = titlePath[:depth - 1]
    # ... pad any levels that were skipped on the way down ...
    newPath.extend([None] * (depth - 1 - len(newPath)))
    # ... and record this heading (an empty title counts as absent).
    newPath.append(title or None)
    return newPath


def _chunkSentences(line, line_max_length):
    """Split *line* into sentences and regroup them into chunks.

    Each run of terminators is attached to the sentence before it. Sentences
    are concatenated until adding the next one would exceed
    *line_max_length*; a single sentence longer than the limit becomes its
    own chunk.
    """
    # With a capturing group, re.split yields
    # [text, stops, text, stops, ..., text]; pair each text with its stops.
    parts = _STOP_RUN_RE.split(line)
    sentences = []
    for idx in range(0, len(parts), 2):
        sentence = parts[idx]
        if idx + 1 < len(parts):
            sentence += parts[idx + 1]
        if sentence:
            sentences.append(sentence)

    chunks = []
    current = []
    currentLen = 0
    for sentence in sentences:
        # Flush only when there is something to flush, so an over-long first
        # sentence does not produce an empty chunk.
        if current and currentLen + len(sentence) > line_max_length:
            chunks.append("".join(current))
            current = []
            currentLen = 0
        current.append(sentence)
        currentLen += len(sentence)
    if current:
        chunks.append("".join(current))
    return chunks


def getWikiSentences(html, line_max_length=150):
    """Convert wiki page HTML into a list of heading-prefixed text chunks.

    The element with class ``mw-parser-output`` is converted to Markdown,
    table rows are skipped, and every remaining line is split into sentence
    chunks of roughly *line_max_length* characters. Each chunk is prefixed
    with the "/"-joined path of the headings above it.

    Args:
        html: HTML source of the page.
        line_max_length: Soft upper bound on the length of one chunk (before
            the heading prefix is added). A single longer sentence is kept
            whole.

    Returns:
        A list of dicts with keys ``text`` (Markdown stripped), ``markdown``
        (Markdown kept), ``text_len`` and ``markdown_len``. Empty when the
        HTML has no ``mw-parser-output`` element.
    """
    # Drop all line breaks from the raw HTML before parsing
    html = html.replace("\r", "").replace("\n", "")
    soup = BeautifulSoup(html, 'html.parser')
    parserOutput = soup.find(class_="mw-parser-output")
    if parserOutput is None:
        # Nothing to extract; avoid AttributeError on None below.
        return []
    # Remove <style>
    for style in parserOutput.find_all("style"):
        style.decompose()
    # Remove <caption> if empty. get_text() is used because .string is None
    # for a caption with several children even when it contains text.
    for caption in parserOutput.find_all("caption"):
        if caption.get_text().strip() == "":
            caption.decompose()
    mdContent = MarkdownConverter(heading_style="ATX", newline_style="BACKSLASH").convert_soup(parserOutput)
    titlePath = []
    docList = []
    for line in mdContent.split("\n"):
        line = line.strip()
        if not line:
            continue
        if line.startswith("#"):
            # Title
            titlePath = _updateTitlePath(titlePath, line)
            continue
        if line.startswith("|") and line.endswith("|"):
            # Table row
            continue
        if line.startswith(">"):
            # Quote: drop the marker, keep the quoted text
            line = line[1:].strip()
        # Text under no heading is filed under a default section name
        titlePathStr = "/".join(title for title in titlePath if title) or "简介"
        for text in _chunkSentences(line, line_max_length):
            stripedText = titlePathStr + ": " + stripMarkdown(text)
            text = titlePathStr + ": " + text
            docList.append({
                "text": stripedText,
                "markdown": text,
                "text_len": len(stripedText),
                "markdown_len": len(text),
            })
    return docList