|
|
|
<?php
|
|
|
|
|
|
|
|
namespace LatinizeUrl;
|
|
|
|
|
|
|
|
use Exception;
|
|
|
|
use Fukuball\Jieba\Jieba;
|
|
|
|
use Fukuball\Jieba\Finalseg;
|
|
|
|
use Fukuball\Jieba\Posseg;
|
|
|
|
use MediaWiki\MediaWikiServices;
|
|
|
|
use Overtrue\Pinyin\Pinyin;
|
|
|
|
use MediaWiki\Status\Status;
|
|
|
|
use FormatJson;
|
|
|
|
|
|
|
|
/**
 * Convertor that turns Chinese text (hanzi) into pinyin.
 *
 * The backend is selected by the 'parser' configuration key: the value X is
 * dispatched to the method named "XParse" (this class provides innerParse(),
 * hookParse() and apiParse()). Other configuration keys read by this class:
 * 'fallback' (parser to use when hook/api parsing fails), 'cutWord' (enable
 * word segmentation in innerParse()) and 'url' (endpoint for apiParse()).
 */
class ChineseConvertor extends BaseConvertor {
    /** @var mixed Convertor configuration, accessed with array syntax. */
    private $config;

    /** @var self|null Lazily created shared instance, see standalone(). */
    private static $standalone = null;

    /** @var bool Whether the bundled vendor/autoload.php has been required. */
    private static $libLoaded = false;

    /** @var bool Whether the Jieba segmenter has been initialised. */
    private static $jiebaLoaded = false;

    /** @var Pinyin|null Shared Pinyin converter used by innerParse(). */
    private static $pinyinParser = null;

    /**
     * Return the shared instance, building it on first use from the
     * 'LatinizeUrlChineseConvertorConfig' main-config setting.
     *
     * @return self
     */
    public static function standalone() {
        if (self::$standalone) {
            return self::$standalone;
        }

        $mainConfig = MediaWikiServices::getInstance()->getMainConfig();
        self::$standalone = new self($mainConfig->get('LatinizeUrlChineseConvertorConfig'));

        return self::$standalone;
    }

    /**
     * Hook handler: supply this convertor for the language codes 'zh-cn'
     * and 'zh-hans'; any other code leaves $convertor untouched.
     *
     * @param string $langCode Language code being resolved.
     * @param mixed &$convertor Receives the shared instance on a match.
     * @return bool Always true.
     */
    public static function onGetConvertor($langCode, &$convertor) {
        // Strict comparison so non-string values can never match loosely.
        if (in_array($langCode, ['zh-cn', 'zh-hans'], true)) {
            $convertor = self::standalone();
        }

        return true;
    }

    /**
     * @param mixed $config Convertor configuration (see class description).
     */
    public function __construct($config) {
        $this->config = $config;
    }

    /**
     * Convert hanzi to pinyin with the configured (or explicitly named) parser.
     *
     * @param string $hanzi Text to convert.
     * @param string|null $parser Parser name overriding config 'parser';
     *  used internally to run the configured fallback parser.
     * @return mixed Whatever the selected "<parser>Parse" method returns.
     * @throws Exception When no usable parser method exists.
     */
    public function parse($hanzi, $parser = null) {
        $name = $parser ?? ($this->config['parser'] ?? '');

        // An empty name would produce the method name "Parse", which PHP
        // resolves case-insensitively to parse() itself and recurses forever.
        $method = (is_string($name) && $name !== '') ? $name . 'Parse' : null;

        if ($method === null || !is_callable([$this, $method])) {
            throw new Exception(
                'Cannot find pinyin parser: ' . (is_scalar($name) ? $name : gettype($name))
            );
        }

        return $this->$method($hanzi);
    }

    /**
     * Flatten a tagged segment list (entries with 'word' and 'tag' keys)
     * into a list of words.
     *
     * Segments tagged 'uv' or 'ud' (the original comment calls these
     * prepositions) are not kept as separate words: a literal '的' is glued
     * onto the previous word instead.
     *
     * @param array $segList Tagged segments, as passed from Posseg::cut().
     * @return string[] Words in their original order.
     */
    private function filteJiebaTag($segList) {
        $words = [];

        foreach ($segList as $seg) {
            $isParticle = $seg['tag'] === 'uv' || $seg['tag'] === 'ud';

            if (!$isParticle) {
                $words[] = $seg['word'];
                continue;
            }

            $last = count($words) - 1;
            if ($last >= 0) {
                $words[$last] .= '的';
            } else {
                // Nothing to attach to yet: start a word instead of writing
                // to the undefined offset -1.
                $words[] = '的';
            }
        }

        return $words;
    }

    /**
     * Convert hanzi to pinyin inside PHP using the bundled libraries.
     *
     * The input is split on '/'. Each part becomes an array of converted
     * segments, and a literal '/' string is placed between consecutive parts.
     * When config 'cutWord' is truthy each part is first segmented with
     * Jieba; otherwise the whole part is treated as a single segment.
     *
     * @param string $hanzi Text to convert.
     * @return array Alternating per-part arrays and '/' separator strings.
     */
    public function innerParse($hanzi) {
        if (!self::$libLoaded) {
            require_once(dirname(__DIR__) . '/vendor/autoload.php');
            self::$libLoaded = true;
        }

        // Word segmentation requested?
        $cutWord = !empty($this->config['cutWord']);
        if ($cutWord) {
            self::loadJieba();
        }

        if (self::$pinyinParser === null) {
            self::$pinyinParser = new Pinyin();
        }

        $flags = PINYIN_NO_TONE | PINYIN_UMLAUT_V | PINYIN_KEEP_PUNCTUATION
            | PINYIN_KEEP_ENGLISH | PINYIN_KEEP_NUMBER;

        $ret = [];
        foreach (explode('/', $hanzi) as $index => $sentence) {
            if ($index > 0) {
                // Keep the separator between the parts of the original text.
                $ret[] = '/';
            }

            $segList = $cutWord
                ? $this->filteJiebaTag(Posseg::cut($sentence))
                : [$sentence];

            // After segmentation, annotate every segment with its pinyin.
            $segPinyin = [];
            foreach ($segList as $seg) {
                $segPinyin[] = self::$pinyinParser->convert($seg, $flags);
            }
            $ret[] = $segPinyin;
        }

        return $ret;
    }

    /**
     * Initialise the Jieba segmenter and load the bundled user dictionary.
     * Runs at most once per request.
     */
    private static function loadJieba() {
        if (self::$jiebaLoaded) {
            return;
        }

        self::ensureMemoryLimit('1024M');
        Jieba::init(['test' => true]);
        Finalseg::init();
        Posseg::init();
        Jieba::loadUserDict(dirname(__DIR__) . '/data/userDict.txt');
        self::$jiebaLoaded = true;
    }

    /**
     * Raise memory_limit to at least $minimum without ever lowering it.
     *
     * @param string $minimum Limit in php.ini shorthand, e.g. '1024M'.
     */
    private static function ensureMemoryLimit($minimum) {
        $current = ini_get('memory_limit');

        if ($current !== false && $current !== '') {
            $current = trim($current);
            // '-1' means unlimited; a larger existing limit must be kept too.
            if ($current === '-1'
                || self::shorthandToBytes($current) >= self::shorthandToBytes($minimum)
            ) {
                return;
            }
        }

        ini_set('memory_limit', $minimum);
    }

    /**
     * Convert a php.ini shorthand size ('128M', '1G', '512K', '1024') to bytes.
     *
     * @param string $value Shorthand size.
     * @return int|float Size in bytes.
     */
    private static function shorthandToBytes($value) {
        $value = trim($value);
        $bytes = (int)$value;

        switch (strtolower(substr($value, -1))) {
            case 'g':
                $bytes *= 1024;
                // fall through
            case 'm':
                $bytes *= 1024;
                // fall through
            case 'k':
                $bytes *= 1024;
        }

        return $bytes;
    }

    /**
     * Convert hanzi to pinyin by running the 'Pinyin2Hanzi' hook.
     *
     * @param string $hanzi Text to convert.
     * @return mixed The value a hook handler stored in its by-reference
     *  argument, or the fallback parser's result when no handler set one.
     * @throws Exception When the hook produced nothing and no usable
     *  fallback parser is configured.
     */
    public function hookParse($hanzi) {
        $pinyinList = null;
        MediaWikiServices::getInstance()->getHookContainer()
            ->run('Pinyin2Hanzi', [$hanzi, &$pinyinList]);

        if ($pinyinList) {
            return $pinyinList;
        }

        return $this->fallbackOrException($hanzi, 'Hook Pinyin2Hanzi never handled.', 'hook');
    }

    /**
     * Run the configured fallback parser, or throw when none can be used.
     *
     * @param string $hanzi Text to convert.
     * @param string $message Exception message used when there is no fallback.
     * @param string|null $failedParser Name of the parser that just failed;
     *  a fallback with the same name is refused so the failure cannot loop.
     * @return mixed Result of the fallback parser.
     * @throws Exception When no usable fallback is configured.
     */
    private function fallbackOrException($hanzi, $message, $failedParser = null) {
        $fallback = $this->config['fallback'] ?? false;

        $sameAsFailed = is_string($fallback)
            && $failedParser !== null
            && strcasecmp($fallback, $failedParser) === 0;

        if ($fallback && !$sameAsFailed) {
            return $this->parse($hanzi, $fallback);
        }

        throw new Exception($message);
    }

    /**
     * Convert hanzi to pinyin through the remote HTTP API at config 'url'.
     *
     * The text is POSTed as the 'sentence' field. The response is decoded as
     * JSON; it is accepted when it has no 'error' key, 'status' is exactly 1
     * and a 'data' key is present. Any other outcome goes to the fallback
     * parser.
     *
     * @param string $hanzi Text to convert.
     * @return mixed The 'data' member of the response, or the fallback result.
     * @throws Exception When 'url' is not configured, or the request failed
     *  and no usable fallback parser is configured.
     */
    public function apiParse($hanzi) {
        if (!isset($this->config['url'])) {
            throw new Exception('LatinizeUrl remote api url not set.');
        }

        $request = MediaWikiServices::getInstance()->getHttpRequestFactory()->create(
            $this->config['url'],
            [
                'method' => 'POST',
                'postData' => [
                    'sentence' => $hanzi
                ],
            ],
            __METHOD__
        );

        $status = Status::wrap($request->execute());
        if (!$status->isOK()) {
            return $this->fallbackOrException($hanzi, 'Cannot use LatinizeUrl remote api.', 'api');
        }

        $json = FormatJson::decode($request->getContent(), true);
        if (!is_array($json)) {
            // Undecodable or non-object body: treat like any other API failure.
            return $this->fallbackOrException($hanzi, 'Cannot use LatinizeUrl remote api.', 'api');
        }

        if (isset($json["error"])) {
            return $this->fallbackOrException(
                $hanzi,
                'LatinizeUrl remote api error: ' . $json["error"],
                'api'
            );
        }

        $succeeded = isset($json["status"]) && $json["status"] === 1 && isset($json["data"]);
        if (!$succeeded) {
            return $this->fallbackOrException($hanzi, 'Cannot use LatinizeUrl remote api.', 'api');
        }

        return $json["data"];
    }
}
|