You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

72 lines
1.7 KiB
Python

import itertools
import re
def isAscii(inputStr):
    """Return True if inputStr is non-empty and every character is <= U+00FF.

    Despite the name, the accepted range is the full Latin-1 block
    (0x00-0xFF), not just 7-bit ASCII. An empty string yields False.
    """
    if len(inputStr) == 0:
        return False
    return all(ord(ch) <= 0xFF for ch in inputStr)
def isAsciiPunc(inputStr):
    """Return True if inputStr is non-empty and consists only of punctuation.

    "Punctuation" here means the code point ranges 0x20-0x2F (space through
    '/'), 0x3A-0x40 (':' through '@') and 0x5B-0x60 ('[' through '`').
    Characters 0x7B-0x7E ('{', '|', '}', '~') are not in the accepted set.
    An empty string yields False.
    """
    # `\Z` anchors at the very end of the string. `$` would also match just
    # before a trailing newline, wrongly accepting inputs such as "!\n".
    return bool(re.match(r"[\x20-\x2f\x3a-\x40\x5b-\x60]+\Z", inputStr))
def isAsciiChar(char):
    """Return True if the single character's code point is at most 0xFF.

    Note that this covers the Latin-1 range, not only 7-bit ASCII.
    """
    codePoint = ord(char)
    return codePoint < 0x100
def isAsciiPuncChar(char):
    """Return True if the single character falls in a punctuation range.

    The accepted inclusive code point ranges are 0x20-0x2F, 0x3A-0x40 and
    0x5B-0x60. The space character (0x20) is therefore counted as
    punctuation, while 0x7B-0x7E are not.
    """
    code = ord(char)
    puncRanges = ((0x20, 0x2F), (0x3A, 0x40), (0x5B, 0x60))
    return any(low <= code <= high for low, high in puncRanges)
class CHARTYPE:
    """Integer tags for the character classes returned by getCharType."""

    # ASCII = 0: code point <= 0xFF and not in a punctuation range
    # ASCII_PUNC = 1: code point <= 0xFF and in a punctuation range
    # UNICODE = 2: code point above 0xFF
    ASCII, ASCII_PUNC, UNICODE = range(3)
def getCharType(char):
    """Classify a single character as one of the CHARTYPE constants.

    Characters rejected by isAsciiChar are CHARTYPE.UNICODE. The remaining
    characters are CHARTYPE.ASCII_PUNC when isAsciiPuncChar accepts them and
    CHARTYPE.ASCII otherwise.
    """
    if not isAsciiChar(char):
        return CHARTYPE.UNICODE
    return CHARTYPE.ASCII_PUNC if isAsciiPuncChar(char) else CHARTYPE.ASCII
# Code point -> replacement text. Each pair is spelled out individually so the
# source and target sides cannot drift out of alignment, which is what happens
# when two parallel strings of different lengths are zipped together.
_CJK_PUNC_TABLE = {
    0x3002: u'.',  # ideographic full stop
    0x3010: u'[',  # left black lenticular bracket
    0x3011: u']',  # right black lenticular bracket
    0x300A: u' ',  # left double angle bracket
    0x300B: u' ',  # right double angle bracket
    0x00B7: u' ',  # middle dot
    # Full-width punctuation. The ASCII counterparts need no entry because
    # they would simply map to themselves.
    0xFF0C: u',',
    0xFF01: u'!',
    0xFF1F: u'?',
    0xFF08: u'(',
    0xFF09: u')',
    0xFF05: u'%',
    0xFF03: u'#',
    0xFF20: u'@',
    0xFF06: u'&',
}
# Full-width digits U+FF10..U+FF19 -> '0'..'9'.
_CJK_PUNC_TABLE.update(
    (0xFF10 + digit, u'%d' % digit) for digit in range(10)
)


def replaceCJKPunc(string):
    """Return string with CJK / full-width punctuation replaced by ASCII.

    Double angle brackets and the middle dot become a single space; the other
    mapped characters become their closest ASCII equivalent. Characters not in
    the table, including plain ASCII punctuation and digits, are returned
    unchanged.
    """
    return string.translate(_CJK_PUNC_TABLE)
def splitAscii(string):
    """Split string into runs of characters that share a character type.

    The input is first passed through replaceCJKPunc. The result is then cut
    at every space character (spaces never appear in the output), and each
    remaining chunk is further divided wherever getCharType changes between
    adjacent characters.

    Returns a list of non-empty substrings in their original order. An empty
    input yields an empty list, so the return type is always a list.
    """
    if not string:
        return []

    normalized = replaceCJKPunc(string)
    segments = []
    # Splitting on the literal space (not arbitrary whitespace) keeps tabs and
    # newlines inside segments; consecutive spaces produce empty chunks, which
    # contribute no groups.
    for chunk in normalized.split(" "):
        segments.extend(
            "".join(run)
            for _, run in itertools.groupby(chunk, key=getCharType)
        )
    return segments