You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

174 lines
5.7 KiB
PHP

<?php
namespace LatinizeUrl;
use Exception;
use Fukuball\Jieba\Jieba;
use Fukuball\Jieba\Finalseg;
use Fukuball\Jieba\Posseg;
use MediaWiki\MediaWikiServices;
use Overtrue\Pinyin\Pinyin;
use MediaWiki\Status\Status;
use FormatJson;
class ChineseConvertor extends BaseConvertor {
private $config;
private static $standalone = null;
private static $libLoaded = false;
private static $jiebaLoaded = false;
private static $pinyinParser = null;
public static function standalone() {
if (!self::$standalone) {
$service = MediaWikiServices::getInstance();
$config = $service->getMainConfig();
$wgLatinizeUrlChineseConvertorConfig = $config->get('LatinizeUrlChineseConvertorConfig');
self::$standalone = new self($wgLatinizeUrlChineseConvertorConfig);
}
return self::$standalone;
}
public static function onGetConvertor($langCode, &$convertor) {
if (in_array($langCode, ['zh-cn', 'zh-hans'])) {
$convertor = self::standalone();
}
return true;
}
public function __construct($config) {
$this->config = $config;
}
public function parse($hanzi) {
$method = $this->config['parser'] . 'Parse';
if (is_callable([$this, $method])) {
return call_user_func([$this, $method], $hanzi);
} else {
throw new Exception('Cannot find pinyin parser: ' . $this->config['parser']);
}
}
private function filteJiebaTag($segList) {
$ret = [];
foreach ($segList as $seg) {
if ($seg['tag'] === 'uv' || $seg['tag'] === 'ud') { //介词
$index = count($ret) - 1;
$ret[$index] .= '的';
} else {
$ret[] = $seg['word'];
}
}
return $ret;
}
/**
* 使用php内部方法实现汉字转拼音
*/
public function innerParse($hanzi) {
$ret = [];
if (!self::$libLoaded) {
require_once(dirname(__DIR__) . '/vendor/autoload.php');
self::$libLoaded = true;
}
$originalSentenceList = explode('/', $hanzi);
$sentenceList = [];
if (isset($this->config['cutWord']) && $this->config['cutWord']) { //需要分词
if (!self::$jiebaLoaded) {
ini_set('memory_limit', '1024M');
Jieba::init(['test' => true]);
Finalseg::init();
Posseg::init();
Jieba::loadUserDict(dirname(__DIR__) . '/data/userDict.txt');
self::$jiebaLoaded = true;
}
$length = count($originalSentenceList);
for ($i = 0; $i < $length; $i++) {
$sentence = $originalSentenceList[$i];
$sentenceList[] = $this->filteJiebaTag(Posseg::cut($sentence));
if ($i + 1 < $length) {
$sentenceList[] = '/';
}
}
} else {
$length = count($originalSentenceList);
for ($i = 0; $i < $length; $i++) {
$sentence = $originalSentenceList[$i];
$sentenceList[] = [$sentence];
if ($i + 1 < $length) {
$sentenceList[] = '/';
}
}
}
//分词后,进行拼音标注
if (!self::$pinyinParser) {
self::$pinyinParser = new Pinyin();
}
foreach ($sentenceList as $segList) {
if (is_array($segList)) {
$segPinyin = [];
foreach ($segList as $seg) {
$segPinyin[] = self::$pinyinParser->convert(
$seg,
PINYIN_NO_TONE | PINYIN_UMLAUT_V | PINYIN_KEEP_PUNCTUATION | PINYIN_KEEP_ENGLISH | PINYIN_KEEP_NUMBER
);
}
$ret[] = $segPinyin;
} else {
$ret[] = $segList;
}
}
return $ret;
}
/**
* 使用hook进行汉字转拼音
*/
public function hookParse($hanzi) {
$pinyinList = null;
MediaWikiServices::getInstance()->getHookContainer()->run('Pinyin2Hanzi', [$hanzi, &$pinyinList]);
if (!$pinyinList) {
if (isset($this->config['fallback'])) {
return $this->parse($hanzi, $this->config['fallback']);
} else {
throw new Exception('Hook Pinyin2Hanzi never handled.');
}
}
}
private function fallbackOrException($hanzi, $message) {
if (isset($this->config['fallback']) && $this->config['fallback'] != false) {
return $this->parse($hanzi, $this->config['fallback']);
} else {
throw new Exception($message);
}
}
public function apiParse($hanzi) {
if (!isset($this->config['url'])) {
throw new Exception('LatinizeUrl remote api url not set.');
}
$factory = MediaWikiServices::getInstance()->getHttpRequestFactory();
$req = $factory->create($this->config['url'], [
'method' => 'POST',
'postData' => [
'sentence' => $hanzi
],
], __METHOD__);
$status = Status::wrap($req->execute());
if (!$status->isOK()) {
$this->fallbackOrException($hanzi, 'Cannot use LatinizeUrl remote api.');
}
$json = FormatJson::decode($req->getContent(), true);
if (isset($json["error"])) {
$this->fallbackOrException($hanzi, 'LatinizeUrl remote api error: ' . $json["error"]);
}
if (!isset($json["status"]) || $json["status"] !== 1) {
$this->fallbackOrException($hanzi, 'Cannot use LatinizeUrl remote api.');
}
return $json["data"];
}
}