|
|
<?php
|
|
|
use Fukuball\Jieba\Jieba;
|
|
|
use Fukuball\Jieba\Finalseg;
|
|
|
use Fukuball\Jieba\JiebaAnalyse;
|
|
|
use Fukuball\Jieba\Posseg;
|
|
|
|
|
|
class JiebaTest extends PHPUnit_Framework_TestCase
|
|
|
{
|
|
|
|
|
|
public function testJiebaInit()
|
|
|
{
|
|
|
Jieba::init();
|
|
|
$this->assertGreaterThan(0, Jieba::$total);
|
|
|
}
|
|
|
|
|
|
public function testFinalsegInit()
|
|
|
{
|
|
|
Finalseg::init();
|
|
|
$array_count = count(Finalseg::$prob_start);
|
|
|
$this->assertEquals(4, $array_count);
|
|
|
}
|
|
|
|
|
|
public function testJiebaAnalyseInit()
|
|
|
{
|
|
|
Jieba::init();
|
|
|
JiebaAnalyse::init();
|
|
|
$this->assertGreaterThan(0, JiebaAnalyse::$max_idf);
|
|
|
|
|
|
}
|
|
|
|
|
|
public function testPossegInit()
|
|
|
{
|
|
|
Posseg::init();
|
|
|
$array_count = count(Posseg::$prob_start);
|
|
|
$this->assertEquals(256, $array_count);
|
|
|
}
|
|
|
|
|
|
public function testJiebaCut()
|
|
|
{
|
|
|
$case_array = array(
|
|
|
"怜香惜玉",
|
|
|
"也",
|
|
|
"得",
|
|
|
"要",
|
|
|
"看",
|
|
|
"对象",
|
|
|
"啊",
|
|
|
"!"
|
|
|
);
|
|
|
|
|
|
$seg_list = Jieba::cut("怜香惜玉也得要看对象啊!");
|
|
|
$this->assertEquals($case_array, $seg_list);
|
|
|
|
|
|
$case_array = array(
|
|
|
"我",
|
|
|
"来到",
|
|
|
"北京",
|
|
|
"清华大学"
|
|
|
);
|
|
|
|
|
|
$seg_list = Jieba::cut("我来到北京清华大学");
|
|
|
$this->assertEquals($case_array, $seg_list);
|
|
|
|
|
|
$case_array = array(
|
|
|
"他",
|
|
|
"来到",
|
|
|
"了",
|
|
|
"网易",
|
|
|
"杭研",
|
|
|
"大厦"
|
|
|
);
|
|
|
|
|
|
$seg_list = Jieba::cut("他来到了网易杭研大厦");
|
|
|
$this->assertEquals($case_array, $seg_list);
|
|
|
|
|
|
}
|
|
|
|
|
|
public function testJiebaCutAll()
|
|
|
{
|
|
|
|
|
|
$case_array = array(
|
|
|
"我",
|
|
|
"来到",
|
|
|
"北京",
|
|
|
"清华",
|
|
|
"清华大学",
|
|
|
"华大",
|
|
|
"大学"
|
|
|
);
|
|
|
|
|
|
$seg_list = Jieba::cut("我来到北京清华大学", true);
|
|
|
$this->assertEquals($case_array, $seg_list);
|
|
|
|
|
|
}
|
|
|
|
|
|
public function testJiebaCutForSearch()
|
|
|
{
|
|
|
$case_array = array(
|
|
|
"小",
|
|
|
"明",
|
|
|
"硕士",
|
|
|
"毕业",
|
|
|
"于",
|
|
|
"中国",
|
|
|
"科学",
|
|
|
"学院",
|
|
|
"科学院",
|
|
|
"中国科学院",
|
|
|
"计算",
|
|
|
"计算所",
|
|
|
",",
|
|
|
"后",
|
|
|
"在",
|
|
|
"日本",
|
|
|
"京都",
|
|
|
"大学",
|
|
|
"日本京都大学",
|
|
|
"深造"
|
|
|
);
|
|
|
|
|
|
$seg_list = Jieba::cutForSEarch("小明硕士毕业于中国科学院计算所,后在日本京都大学深造");
|
|
|
$this->assertEquals($case_array, $seg_list);
|
|
|
|
|
|
}
|
|
|
|
|
|
public function testFinalsegCut()
|
|
|
{
|
|
|
$case_array = array(
|
|
|
"怜香惜",
|
|
|
"玉",
|
|
|
"也",
|
|
|
"得",
|
|
|
"要",
|
|
|
"看",
|
|
|
"对象",
|
|
|
"啊"
|
|
|
);
|
|
|
|
|
|
$seg_list = Finalseg::cut("怜香惜玉也得要看对象啊!");
|
|
|
$this->assertEquals($case_array, $seg_list);
|
|
|
}
|
|
|
|
|
|
public function testExtractTags()
|
|
|
{
|
|
|
$case_array = array(
|
|
|
"所謂"=>1.0102620424985915,
|
|
|
"是否"=>0.7386504806253521,
|
|
|
"一般"=>0.60759968349154936,
|
|
|
"沒有"=>0.33675401416619716,
|
|
|
"肌迫"=>0.33675401416619716,
|
|
|
"雖然"=>0.33675401416619716,
|
|
|
"退縮"=>0.33675401416619716,
|
|
|
"矯作"=>0.33675401416619716,
|
|
|
"怯懦"=>0.27109891642140843
|
|
|
);
|
|
|
|
|
|
$top_k = 9;
|
|
|
$content = file_get_contents(dirname(dirname(__FILE__))."/src/dict/lyric.txt", "r");
|
|
|
|
|
|
$tags = JiebaAnalyse::extractTags($content, $top_k);
|
|
|
$this->assertEquals($case_array, $tags);
|
|
|
}
|
|
|
|
|
|
public function testLoadUserDict()
|
|
|
{
|
|
|
|
|
|
$case_array = array(
|
|
|
"李小福",
|
|
|
"是",
|
|
|
"创新办",
|
|
|
"主任",
|
|
|
"也",
|
|
|
"是",
|
|
|
"云计算",
|
|
|
"方面",
|
|
|
"的",
|
|
|
"专家"
|
|
|
);
|
|
|
|
|
|
Jieba::loadUserDict(dirname(dirname(__FILE__)).'/src/dict/user_dict.txt');
|
|
|
|
|
|
$seg_list = Jieba::cut("李小福是创新办主任也是云计算方面的专家");
|
|
|
$this->assertEquals($case_array, $seg_list);
|
|
|
|
|
|
}
|
|
|
|
|
|
public function testPossegCut()
|
|
|
{
|
|
|
|
|
|
|
|
|
$case_array = array(
|
|
|
array(
|
|
|
"word" => "这",
|
|
|
"tag" => "r"
|
|
|
),
|
|
|
array(
|
|
|
"word" => "是",
|
|
|
"tag" => "v"
|
|
|
),
|
|
|
array(
|
|
|
"word" => "一个",
|
|
|
"tag" => "m"
|
|
|
),
|
|
|
array(
|
|
|
"word" => "伸手不见五指",
|
|
|
"tag" => "i"
|
|
|
),
|
|
|
array(
|
|
|
"word" => "的",
|
|
|
"tag" => "uj"
|
|
|
),
|
|
|
array(
|
|
|
"word" => "黑夜",
|
|
|
"tag" => "n"
|
|
|
),
|
|
|
array(
|
|
|
"word" => "。",
|
|
|
"tag" => "w"
|
|
|
),
|
|
|
array(
|
|
|
"word" => "我",
|
|
|
"tag" => "r"
|
|
|
),
|
|
|
array(
|
|
|
"word" => "叫",
|
|
|
"tag" => "v"
|
|
|
),
|
|
|
array(
|
|
|
"word" => "孙悟空",
|
|
|
"tag" => "nr"
|
|
|
),
|
|
|
array(
|
|
|
"word" => ",",
|
|
|
"tag" => "w"
|
|
|
),
|
|
|
array(
|
|
|
"word" => "我",
|
|
|
"tag" => "r"
|
|
|
),
|
|
|
array(
|
|
|
"word" => "爱",
|
|
|
"tag" => "v"
|
|
|
),
|
|
|
array(
|
|
|
"word" => "北京",
|
|
|
"tag" => "ns"
|
|
|
),
|
|
|
array(
|
|
|
"word" => ",",
|
|
|
"tag" => "w"
|
|
|
),
|
|
|
array(
|
|
|
"word" => "我",
|
|
|
"tag" => "r"
|
|
|
),
|
|
|
array(
|
|
|
"word" => "爱",
|
|
|
"tag" => "v"
|
|
|
),
|
|
|
array(
|
|
|
"word" => "Python",
|
|
|
"tag" => "eng"
|
|
|
),
|
|
|
array(
|
|
|
"word" => "和",
|
|
|
"tag" => "c"
|
|
|
),
|
|
|
array(
|
|
|
"word" => "C++",
|
|
|
"tag" => "eng"
|
|
|
),
|
|
|
array(
|
|
|
"word" => "。",
|
|
|
"tag" => "w"
|
|
|
)
|
|
|
);
|
|
|
|
|
|
$seg_list = Posseg::cut("这是一个伸手不见五指的黑夜。我叫孙悟空,我爱北京,我爱Python和C++。");
|
|
|
|
|
|
$this->assertEquals($case_array, $seg_list);
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|