You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

120 lines
3.8 KiB
Python

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

import re
from bs4 import BeautifulSoup
from markdownify import MarkdownConverter
def stripMarkdown(text):
    """Return ``text`` with common Markdown markup removed.

    Header markers, bold/italic emphasis, fenced and inline code delimiters
    and link targets are dropped; the text they wrap is kept. Code content
    is padded with one space on each side. The result is stripped of leading
    and trailing whitespace.
    """
    # Ordered (pattern, replacement, flags) rewrite rules. Order matters:
    # '**' must run before '*', '__' before '_', and fenced code blocks
    # before inline code, otherwise the shorter delimiter matches first.
    rules = (
        (r'#+(\s+)', r'\1', 0),                 # header markers
        (r'\*\*(.*?)\*\*', r'\1', 0),           # **bold**
        (r'\*(.*?)\*', r'\1', 0),               # *italic*
        (r'__(.*?)__', r'\1', 0),               # __bold__
        (r'_(.*?)_', r'\1', 0),                 # _italic_
        (r'```(.*?)```', r' \1 ', re.DOTALL),   # fenced code, may span lines
        (r'`(.*?)`', r' \1 ', 0),               # inline code
        (r'\[(.*?)\]\(.*?\)', r'\1', 0),        # [label](target) -> label
    )
    plain = text
    for pattern, replacement, flags in rules:
        plain = re.sub(pattern, replacement, plain, flags=flags)
    return plain.strip()
# List of sentence-ending punctuation marks ("stops").
# NOTE(review): the first three entries are empty strings; presumably they
# were full-width CJK marks lost in an encoding round-trip — confirm against
# the original file. This list is not referenced by the code visible here.
stopList = ["", "", "", ".", "!", "?"]
def _headingDepth(line):
    """Return the number of leading '#' characters of ``line``."""
    # lstrip-based counting cannot run past the end of a line made only of '#'.
    return len(line) - len(line.lstrip("#"))


def _splitSentences(line):
    """Split ``line`` into sentences, each keeping its trailing stop marks."""
    # The original character class listed '!?' twice; the duplicates are
    # assumed to have been the full-width forms (U+FF01, U+FF1F) lost in an
    # encoding round-trip, so they are restored here as escapes.
    parts = re.split(r"([。\uFF01\uFF1F.!?]+)", line)
    # With a capturing group, re.split returns [text, stops, text, ..., text]
    # (odd length). Pad with an empty stop so that the final text fragment,
    # which has no terminating punctuation, is paired and kept too.
    parts.append("")
    return [
        body + stops
        for body, stops in zip(parts[::2], parts[1::2])
        if body or stops
    ]


def _packSentences(sentences, maxLength):
    """Greedily join consecutive sentences into chunks of at most ``maxLength``.

    A single sentence longer than ``maxLength`` becomes a chunk of its own;
    sentences are never cut.
    """
    chunks = []
    current = []
    currentLen = 0
    for sentence in sentences:
        # Only flush a non-empty chunk, so an over-long first sentence does
        # not produce an empty chunk in front of it.
        if current and currentLen + len(sentence) > maxLength:
            chunks.append("".join(current))
            current = []
            currentLen = 0
        current.append(sentence)
        currentLen += len(sentence)
    if current:
        chunks.append("".join(current))
    return chunks


def getWikiSentences(html, line_max_length=150):
    """Convert wiki page HTML into a list of heading-prefixed text chunks.

    The element with class ``mw-parser-output`` (or the whole document when
    no such element exists) is converted to Markdown. Heading lines update
    the current title path, table rows are skipped, and every other line is
    split into sentences which are packed into chunks of roughly
    ``line_max_length`` characters.

    Args:
        html: HTML source of the page as a string.
        line_max_length: Target maximum length of one chunk, not counting
            the title prefix. A single longer sentence is kept whole.

    Returns:
        A list of dicts with the keys ``text`` (title path + chunk with
        Markdown stripped), ``markdown`` (title path + raw Markdown chunk),
        ``text_len`` and ``markdown_len``.
    """
    # Remove all new-lines
    html = html.replace("\r", "").replace("\n", "")
    soup = BeautifulSoup(html, 'html.parser')
    parserOutput = soup.find(class_="mw-parser-output")
    if parserOutput is None:
        # find() returns None when the wrapper is absent; process the whole
        # document instead of failing with AttributeError below.
        parserOutput = soup
    # Remove <style>
    for style in parserOutput.find_all("style"):
        style.decompose()
    # Remove <caption> if empty
    for caption in parserOutput.find_all("caption"):
        if caption.string is None or caption.string.strip() == "":
            caption.decompose()
    mdContent = MarkdownConverter(
        heading_style="ATX", newline_style="BACKSLASH"
    ).convert_soup(parserOutput)

    # titlePath[d - 1] holds the title of the current depth-d heading, or
    # None for a level that was skipped (or had an empty title).
    titlePath = []
    docList = []
    for line in mdContent.split("\n"):
        line = line.strip()
        if not line:
            continue

        if line[0] == "#":
            # Title: keep the ancestors, pad skipped levels, store the title.
            depth = _headingDepth(line)
            titlePath = titlePath[:depth - 1]
            titlePath.extend([None] * (depth - 1 - len(titlePath)))
            titlePath.append(line[depth:].strip() or None)
            continue

        if line[0] == "|" and line[-1] == "|":
            # Table
            continue

        if line[0] == ">":
            # Quote
            line = line[1:].strip()

        # Sentence
        titlePathStr = "/".join(
            [title for title in titlePath if title is not None]
        )
        if titlePathStr == "":
            # Text before the first heading; the label is Chinese for
            # "Introduction".
            titlePathStr = "简介"

        # Split the line by stops, then combine sentences up to
        # line_max_length.
        chunks = _packSentences(_splitSentences(line), line_max_length)

        # generate doc list
        for markdownText in chunks:
            # Strip markdown syntax
            strippedText = titlePathStr + ": " + stripMarkdown(markdownText)
            markdownText = titlePathStr + ": " + markdownText
            docList.append({
                "text": strippedText,
                "markdown": markdownText,
                "text_len": len(strippedText),
                "markdown_len": len(markdownText),
            })
    return docList