You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

285 lines
6.6 KiB
PHP

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

<?php
use Fukuball\Jieba\Jieba;
use Fukuball\Jieba\Finalseg;
use Fukuball\Jieba\JiebaAnalyse;
use Fukuball\Jieba\Posseg;
/**
 * PHPUnit test case for the Jieba, Finalseg, JiebaAnalyse and Posseg classes.
 *
 * Each test either initialises one of those classes and checks a static
 * property afterwards, or segments a fixed sentence and compares the result
 * with a hard-coded expectation.
 *
 * NOTE(review): several expected tokens in this file are empty strings ("")
 * at positions where the input sentence contains a single character. A
 * segmenter returning empty tokens seems unlikely, so these look like
 * characters that were lost when the file was copied or rendered. Confirm
 * against the upstream fixture before trusting these expectations; the
 * literals are intentionally left exactly as found.
 *
 * NOTE(review): the cut tests do not call Jieba::init() themselves; they
 * presumably rely on the init tests having run earlier in the same process.
 */
class JiebaTest extends PHPUnit_Framework_TestCase
{
    /**
     * After Jieba::init(), Jieba::$total must be greater than zero.
     */
    public function testJiebaInit()
    {
        Jieba::init();

        $this->assertGreaterThan(0, Jieba::$total);
    }

    /**
     * After Finalseg::init(), Finalseg::$prob_start must hold 4 entries.
     */
    public function testFinalsegInit()
    {
        Finalseg::init();

        $array_count = count(Finalseg::$prob_start);
        $this->assertEquals(4, $array_count);
    }

    /**
     * After Jieba::init() and JiebaAnalyse::init(), JiebaAnalyse::$max_idf
     * must be greater than zero.
     */
    public function testJiebaAnalyseInit()
    {
        Jieba::init();
        JiebaAnalyse::init();

        $this->assertGreaterThan(0, JiebaAnalyse::$max_idf);
    }

    /**
     * After Posseg::init(), Posseg::$prob_start must hold 256 entries.
     */
    public function testPossegInit()
    {
        Posseg::init();

        $array_count = count(Posseg::$prob_start);
        $this->assertEquals(256, $array_count);
    }

    /**
     * Jieba::cut() with its default arguments on three sample sentences.
     *
     * NOTE(review): the "" entries are presumably lost single-character
     * tokens - confirm against upstream.
     */
    public function testJiebaCut()
    {
        $case_array = array(
            "怜香惜玉",
            "",
            "",
            "",
            "",
            "对象",
            "",
            ""
        );
        $seg_list = Jieba::cut("怜香惜玉也得要看对象啊!");
        $this->assertEquals($case_array, $seg_list);

        $case_array = array(
            "",
            "来到",
            "北京",
            "清华大学"
        );
        $seg_list = Jieba::cut("我来到北京清华大学");
        $this->assertEquals($case_array, $seg_list);

        $case_array = array(
            "",
            "来到",
            "",
            "网易",
            "杭研",
            "大厦"
        );
        $seg_list = Jieba::cut("他来到了网易杭研大厦");
        $this->assertEquals($case_array, $seg_list);
    }

    /**
     * Jieba::cut() with the second argument set to true; the expectation
     * contains overlapping words for the same input span.
     *
     * NOTE(review): the leading "" is presumably a lost single-character
     * token - confirm against upstream.
     */
    public function testJiebaCutAll()
    {
        $case_array = array(
            "",
            "来到",
            "北京",
            "清华",
            "清华大学",
            "华大",
            "大学"
        );
        $seg_list = Jieba::cut("我来到北京清华大学", true);
        $this->assertEquals($case_array, $seg_list);
    }

    /**
     * Jieba::cutForSearch() on a sample sentence; the expectation lists
     * shorter words alongside the longer words that contain them.
     *
     * NOTE(review): the "" entries are presumably lost single-character
     * tokens - confirm against upstream.
     */
    public function testJiebaCutForSearch()
    {
        $case_array = array(
            "",
            "",
            "硕士",
            "毕业",
            "",
            "中国",
            "科学",
            "学院",
            "科学院",
            "中国科学院",
            "计算",
            "计算所",
            "",
            "",
            "",
            "日本",
            "京都",
            "大学",
            "日本京都大学",
            "深造"
        );
        // Method name written in its canonical casing; the previous spelling
        // only resolved because PHP method names are case-insensitive.
        $seg_list = Jieba::cutForSearch("小明硕士毕业于中国科学院计算所,后在日本京都大学深造");
        $this->assertEquals($case_array, $seg_list);
    }

    /**
     * Finalseg::cut() called directly on a sample sentence.
     *
     * NOTE(review): the "" entries are presumably lost single-character
     * tokens - confirm against upstream.
     */
    public function testFinalsegCut()
    {
        $case_array = array(
            "怜香惜",
            "",
            "",
            "",
            "",
            "",
            "对象",
            ""
        );
        $seg_list = Finalseg::cut("怜香惜玉也得要看对象啊!");
        $this->assertEquals($case_array, $seg_list);
    }

    /**
     * JiebaAnalyse::extractTags() on the bundled lyric.txt, limited to the
     * top 9 tags; the expectation maps each tag to a float score.
     */
    public function testExtractTags()
    {
        $case_array = array(
            "所謂"=>1.0102620424985915,
            "是否"=>0.7386504806253521,
            "一般"=>0.60759968349154936,
            "沒有"=>0.33675401416619716,
            "肌迫"=>0.33675401416619716,
            "雖然"=>0.33675401416619716,
            "退縮"=>0.33675401416619716,
            "矯作"=>0.33675401416619716,
            "怯懦"=>0.27109891642140843
        );
        $top_k = 9;

        // The second parameter of file_get_contents() is the boolean
        // use_include_path flag, not an fopen-style mode, so no "r" here.
        $content = file_get_contents(dirname(dirname(__FILE__))."/src/dict/lyric.txt");

        $tags = JiebaAnalyse::extractTags($content, $top_k);
        $this->assertEquals($case_array, $tags);
    }

    /**
     * Loads the bundled user_dict.txt through Jieba::loadUserDict() and then
     * checks Jieba::cut() against an expectation containing multi-character
     * words such as "李小福", "创新办" and "云计算".
     *
     * NOTE(review): the "" entries are presumably lost single-character
     * tokens - confirm against upstream.
     */
    public function testLoadUserDict()
    {
        $case_array = array(
            "李小福",
            "",
            "创新办",
            "主任",
            "",
            "",
            "云计算",
            "方面",
            "",
            "专家"
        );
        Jieba::loadUserDict(dirname(dirname(__FILE__)).'/src/dict/user_dict.txt');

        $seg_list = Jieba::cut("李小福是创新办主任也是云计算方面的专家");
        $this->assertEquals($case_array, $seg_list);
    }

    /**
     * Posseg::cut() on a sample sentence; every expected element is an
     * array with a "word" and a "tag" key.
     *
     * NOTE(review): the "" words are presumably lost single-character tokens.
     * The expectation also contains "w"-tagged entries after "孙悟空" and
     * "北京" for which the input string below shows no character, so the
     * input may have lost characters too - confirm against upstream.
     */
    public function testPossegCut()
    {
        $case_array = array(
            array(
                "word" => "",
                "tag" => "r"
            ),
            array(
                "word" => "",
                "tag" => "v"
            ),
            array(
                "word" => "一个",
                "tag" => "m"
            ),
            array(
                "word" => "伸手不见五指",
                "tag" => "i"
            ),
            array(
                "word" => "",
                "tag" => "uj"
            ),
            array(
                "word" => "黑夜",
                "tag" => "n"
            ),
            array(
                "word" => "",
                "tag" => "w"
            ),
            array(
                "word" => "",
                "tag" => "r"
            ),
            array(
                "word" => "",
                "tag" => "v"
            ),
            array(
                "word" => "孙悟空",
                "tag" => "nr"
            ),
            array(
                "word" => "",
                "tag" => "w"
            ),
            array(
                "word" => "",
                "tag" => "r"
            ),
            array(
                "word" => "",
                "tag" => "v"
            ),
            array(
                "word" => "北京",
                "tag" => "ns"
            ),
            array(
                "word" => "",
                "tag" => "w"
            ),
            array(
                "word" => "",
                "tag" => "r"
            ),
            array(
                "word" => "",
                "tag" => "v"
            ),
            array(
                "word" => "Python",
                "tag" => "eng"
            ),
            array(
                "word" => "",
                "tag" => "c"
            ),
            array(
                "word" => "C++",
                "tag" => "eng"
            ),
            array(
                "word" => "",
                "tag" => "w"
            )
        );
        $seg_list = Posseg::cut("这是一个伸手不见五指的黑夜。我叫孙悟空我爱北京我爱Python和C++。");
        $this->assertEquals($case_array, $seg_list);
    }
}