# isekai-toolkit/utils/wiki.py

import re
from bs4 import BeautifulSoup
from markdownify import MarkdownConverter

def stripMarkdown(text):
    """Return *text* with common Markdown syntax removed.

    Handles ATX headers, ``*``/``_`` emphasis, fenced and inline code,
    links and images. This is a regex-based best effort, not a full
    Markdown parser.

    Args:
        text: Markdown-formatted string.

    Returns:
        The plain-text version of *text*, with leading and trailing
        whitespace stripped.
    """
    # Remove header markers. Anchored to the start of a line so that a "#"
    # inside running text (e.g. "C# is ...") is left untouched.
    text = re.sub(r'^([ \t]*)#+(\s+)', r'\1\2', text, flags=re.MULTILINE)

    # Remove emphasis
    text = re.sub(r'\*\*(.*?)\*\*', r'\1', text)
    text = re.sub(r'\*(.*?)\*', r'\1', text)
    # Underscores glued to ASCII letters/digits are part of an identifier
    # (snake_case, file_name.txt), not emphasis, so they are kept.
    text = re.sub(r'(?<![A-Za-z0-9])__(.*?)__(?![A-Za-z0-9])', r'\1', text)
    text = re.sub(r'(?<![A-Za-z0-9_])_(.*?)_(?![A-Za-z0-9_])', r'\1', text)

    # Remove code blocks
    text = re.sub(r'```(.*?)```', r' \1 ', text, flags=re.DOTALL)

    # Remove inline code
    text = re.sub(r'`(.*?)`', r' \1 ', text)

    # Remove links and images, keeping the link text / alt text. The optional
    # "!" stops image syntax from leaving a stray exclamation mark behind.
    text = re.sub(r'!?\[(.*?)\]\(.*?\)', r'\1', text)

    return text.strip()

# Sentence-ending punctuation marks (CJK and ASCII variants)
stopList = [
    "。", "！", "？",
    ".", "!", "?",
]

def _combineSentences(line, line_max_length):
    """Split *line* at stop marks and regroup the sentences into chunks.

    Each sentence keeps its trailing stop marks. Consecutive sentences are
    joined into one chunk until adding the next one would push the chunk
    past *line_max_length* characters; a single sentence longer than the
    limit becomes a chunk of its own.
    """
    # With a capturing group, re.split returns an odd-length list that
    # alternates: text, stops, text, stops, ..., text.
    parts = re.split(r"([。！？.!?]+)", line)

    sentences = []
    for idx in range(0, len(parts), 2):
        sentence = parts[idx]
        if idx + 1 < len(parts):
            # Attach the stop marks to the sentence they terminate
            sentence += parts[idx + 1]
        # The final element (text after the last stop mark, or the whole
        # line when it has no stop mark) is kept rather than dropped.
        if sentence:
            sentences.append(sentence)

    chunks = []
    currentChunk = []
    currentLen = 0
    for sentence in sentences:
        currentLen += len(sentence)
        # Flush only a non-empty chunk, so an over-long first sentence does
        # not produce an empty chunk.
        if currentLen > line_max_length and currentChunk:
            chunks.append("".join(currentChunk))
            currentChunk = []
            currentLen = len(sentence)
        currentChunk.append(sentence)
    if currentChunk:
        chunks.append("".join(currentChunk))

    return chunks


def getWikiSentences(html, line_max_length=150):
    """Convert rendered wiki HTML into a list of sentence-chunk documents.

    The element with class ``mw-parser-output`` (or the whole document when
    no such element exists) is converted to Markdown. Each non-table text
    line is split into chunks of roughly *line_max_length* characters, and
    every chunk is prefixed with the path of headings it appears under.

    Args:
        html: HTML source of the rendered page.
        line_max_length: Soft upper bound on chunk length, in characters,
            not counting the heading prefix.

    Returns:
        A list of dicts with the keys ``text`` (Markdown stripped),
        ``markdown`` (original Markdown), ``text_len`` and ``markdown_len``.
    """
    # Remove all new-line
    html = html.replace("\r", "").replace("\n", "")

    soup = BeautifulSoup(html, 'html.parser')
    parserOutput = soup.find(class_="mw-parser-output")
    if parserOutput is None:
        # No wrapper element: process the whole document instead of failing
        # with an AttributeError on None.
        parserOutput = soup

    # Remove <style>
    for style in parserOutput.find_all("style"):
        style.decompose()

    # Remove <caption> if empty. get_text() is used because .string is None
    # for any caption with more than one child, even when it has text.
    for caption in parserOutput.find_all("caption"):
        if caption.get_text().strip() == "":
            caption.decompose()

    mdContent = MarkdownConverter(heading_style="ATX", newline_style="BACKSLASH").convert_soup(parserOutput)

    # titlePath[d] holds the current heading of depth d (index 0 is unused);
    # skipped levels are None.
    titlePath = []

    docList = []
    for line in mdContent.split("\n"):
        line = line.strip()
        if line == "":
            continue

        if line.startswith("#"):
            # Title
            depth = len(line) - len(line.lstrip("#"))
            title = line[depth:].strip()
            # Drop headings at this depth or deeper, pad skipped levels,
            # then record this heading at index `depth`.
            del titlePath[depth:]
            titlePath.extend([None] * (depth - len(titlePath)))
            titlePath.append(title or None)
            continue

        if line.startswith("|") and line.endswith("|"):
            # Table
            continue

        if line.startswith(">"):
            # Quote
            line = line[1:].strip()

        # Sentence
        titlePathStr = "/".join([title for title in titlePath if title is not None])
        if titlePathStr == "":
            # Text before any heading is filed under "简介" (introduction)
            titlePathStr = "简介"

        # generate doc list
        for text in _combineSentences(line, line_max_length):
            # Strip markdown syntax
            stripedText = titlePathStr + ": " + stripMarkdown(text)
            text = titlePathStr + ": " + text

            docList.append({
                "text": stripedText,
                "markdown": text,
                "text_len": len(stripedText),
                "markdown_len": len(text),
            })

    return docList