import re

from bs4 import BeautifulSoup
from markdownify import MarkdownConverter


def stripMarkdown(text):
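    """Remove common Markdown syntax from text, keeping the visible content.

    A rough, regex-based stripper for the Markdown produced by the
    conversion below; it is not a full Markdown parser.

    >>> stripMarkdown("## A [link](http://example.com) to **bold**")
    'A link to bold'
    """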
    # Remove headers
    text = re.sub(r'#+(\s+)', r'\1', text)

    # Remove emphasis
    text = re.sub(r'\*\*(.*?)\*\*', r'\1', text)
    text = re.sub(r'\*(.*?)\*', r'\1', text)
    text = re.sub(r'__(.*?)__', r'\1', text)
    text = re.sub(r'_(.*?)_', r'\1', text)

    # Remove code blocks
    text = re.sub(r'```(.*?)```', r' \1 ', text, flags=re.DOTALL)

    # Remove inline code
    text = re.sub(r'`(.*?)`', r' \1 ', text)

    # Remove links, keeping the link text
    text = re.sub(r'\[(.*?)\]\(.*?\)', r'\1', text)

    return text.strip()


# Sentence-ending punctuation (full-width and half-width)
stopList = ["。", "!", "?", ".", "!", "?"]
# Matches one or more stop characters; used for sentence splitting below
stopPattern = "[" + "".join(stopList) + "]+"


def getWikiSentences(html, line_max_length=150):
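    """Split a MediaWiki article's HTML into sentence chunks.

    Converts the page body (.mw-parser-output) to Markdown, walks it line
    by line while tracking the heading path, splits prose at stop
    punctuation, and packs sentences into chunks of at most
    line_max_length characters. Returns a list of dicts with "text",
    "markdown", "text_len" and "markdown_len" for each chunk, prefixed
    with its section path.
    """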
    # Remove all newlines so the HTML is a single line
    html = html.replace("\r", "").replace("\n", "")

    soup = BeautifulSoup(html, 'html.parser')
    # .mw-parser-output is the main article body on MediaWiki pages
    parserOutput = soup.find(class_="mw-parser-output")

    # Remove <style> tags
    for style in parserOutput.find_all("style"):
        style.decompose()

    # Remove <caption> if empty
    for caption in parserOutput.find_all("caption"):
        if caption.string is None or caption.string.strip() == "":
            caption.decompose()

    # Convert the cleaned article body to Markdown
    mdContent = MarkdownConverter(heading_style="ATX", newline_style="BACKSLASH").convert_soup(parserOutput)

    # titlePath[depth] holds the heading title at that depth (index 0 is
    # unused); levels skipped by the document are left as None
    titlePath = []

    docList = []
    for line in mdContent.split("\n"):
        line = line.strip()
        if line == "":
            continue
        if line[0] == "#":
            # Heading: update the current section path
            depth = 0
            while line[depth] == "#":
                depth += 1
            # Drop titles at this depth or deeper, pad any skipped levels
            # with None, then record the new title at index `depth`
            titlePath = titlePath[:depth]
            titlePath += [None] * (depth - len(titlePath))
            titlePath.append(line[depth:].strip())
        elif line[0] == "|" and line[-1] == "|":
            # Table row: skip
            continue
        else:
            if line[0] == ">":
                # Blockquote: drop the marker, keep the text
                line = line[1:].strip()
            # Prose line: build the section-path prefix from the heading titles
            titlePathStr = "/".join([title for title in titlePath if title is not None])
            if titlePathStr == "":
                # Text before the first heading; "简介" means "Introduction"
                titlePathStr = "简介"

            # Split the line at stop punctuation, keeping the delimiters
            sentences = re.split("(" + stopPattern + ")", line)

            combinedSentences = []
            lineChunk = []
            lineLen = 0

            # Re-attach each punctuation run to the sentence before it, then
            # pack sentences into chunks of at most line_max_length characters.
            # A trailing fragment with no terminal punctuation is dropped.
            for i in range(len(sentences) - 1):
                if sentences[i] == "":
                    continue
                if re.match("^" + stopPattern + "$", sentences[i + 1]):
                    sentences[i] += sentences[i + 1]
                    sentences[i + 1] = ""

                lineLen += len(sentences[i])
                if lineLen > line_max_length:
                    # Current chunk is full: flush it and start a new one
                    combinedSentences.append("".join(lineChunk))
                    lineChunk = []
                    lineLen = len(sentences[i])
                lineChunk.append(sentences[i])
            if len(lineChunk) > 0:
                combinedSentences.append("".join(lineChunk))

            # Generate the doc list
            for text in combinedSentences:
                # Strip markdown syntax for the plain-text variant
                strippedText = stripMarkdown(text)

                # Prefix both variants with the section path
                strippedText = titlePathStr + ": " + strippedText
                text = titlePathStr + ": " + text

                doc = {
                    "text": strippedText,
                    "markdown": text,
                    "text_len": len(strippedText),
                    "markdown_len": len(text),
                }
                docList.append(doc)

    return docList
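

# Minimal usage sketch. Assumes network access and the `requests` package;
# the page URL is an arbitrary example, not something this module depends on.
if __name__ == "__main__":
    import requests

    resp = requests.get("https://zh.wikipedia.org/wiki/Python")  # hypothetical example page
    docs = getWikiSentences(resp.text, line_max_length=150)
    for doc in docs[:5]:
        print(doc["text_len"], doc["text"])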