PHP 版余弦定理处理文本相似度 + 分词

<?php
/*
*   文本相似度(余弦相似度 / Cosine Similarity)
*   
*   Author:宋小北(@xiaobeicn)
*   
*   参考:
*   http://www.ruanyifeng.com/blog/2013/03/cosine_similarity.html 
*       http://my.oschina.net/BreathL/blog/42477
*   
*   Use:
*   $obj = new TextSimilarity ($text1, $text2);
*   echo $obj->run();
*/

/**
 * Cosine similarity between two texts.
 *
 * Both texts are split into words by segment(); a term-frequency vector is
 * built for each text over the union of their words (minus a small stop-word
 * list), and the cosine of the angle between the two vectors is returned.
 *
 * Usage:
 *   $obj = new TextSimilarity($text1, $text2);
 *   echo $obj->run();
 */
class TextSimilarity {
    /**
     * Stop words that are ignored when the frequency vectors are built.
     *
     * @var array
     */
    private $_excludeArr = array('的','了','和','呢','啊','哦','恩','嗯','吧');

    /**
     * Term-frequency table: word => array(count in text 1, count in text 2).
     *
     * @var array
     */
    private $_words = array();

    /**
     * Words produced by segmenting the first text.
     *
     * @var array
     */
    private $_segList1 = array();

    /**
     * Words produced by segmenting the second text.
     *
     * @var array
     */
    private $_segList2 = array();

    /**
     * Segment both texts up front; the similarity itself is computed by run().
     *
     * @param string $text1 First text to compare.
     * @param string $text2 Second text to compare.
     */
    public function __construct($text1, $text2)
    {
        $this->_segList1 = $this->segment($text1);
        $this->_segList2 = $this->segment($text2);
    }

    /**
     * Public entry point: build the frequency table and compute the similarity.
     *
     * @return float|string The cosine similarity when it is non-zero, or the
     *                      string 'errors' when it is zero (no shared words)
     *                      or cannot be computed (a text has no countable words).
     */
    public function run()
    {
        $this->analyse();
        $rate = $this->handle();
        // A falsy rate (0) is reported as 'errors', as callers of this class expect.
        return $rate ? $rate : 'errors';
    }

    /**
     * Build the term-frequency table from both segmented word lists.
     */
    private function analyse()
    {
        // Start from an empty table so that calling run() more than once does
        // not keep accumulating counts from earlier calls.
        $this->_words = array();

        $this->countWords($this->_segList1, 0);
        $this->countWords($this->_segList2, 1);
    }

    /**
     * Add the occurrences of every non-excluded word to one column of the table.
     *
     * @param array $segList Words of one text.
     * @param int   $index   Column to increment: 0 for text 1, 1 for text 2.
     */
    private function countWords(array $segList, $index)
    {
        foreach ($segList as $word) {
            if (in_array($word, $this->_excludeArr, true)) {
                continue;
            }
            if (!array_key_exists($word, $this->_words)) {
                $this->_words[$word] = array(0, 0);
            }
            $this->_words[$word][$index] += 1;
        }
    }

    /**
     * Compute the cosine similarity from the term-frequency table.
     *
     * @return float|int dot(t1, t2) / (|t1| * |t2|), or 0 when either vector
     *                   is empty and the quotient would be undefined.
     */
    private function handle()
    {
        $sum = $sumT1 = $sumT2 = 0;
        foreach ($this->_words as $counts) {
            $sum   += $counts[0] * $counts[1];
            $sumT1 += $counts[0] * $counts[0];
            $sumT2 += $counts[1] * $counts[1];
        }

        // If either text contributed no words the denominator is zero;
        // dividing would raise DivisionByZeroError (PHP 8) or yield NAN (PHP 7).
        if ($sumT1 === 0 || $sumT2 === 0) {
            return 0;
        }

        return $sum / sqrt($sumT1 * $sumT2);
    }

    /**
     * Split a text into words using the SCWS extension
     * (http://www.xunsearch.com/scws/docs.php#pscws23).
     *
     * Segmentation here is only a simple example; any word-segmentation
     * service can be substituted.
     *
     * @param string $text Text to segment.
     *
     * @return array List of words in the order SCWS returned them.
     *
     * @throws RuntimeException When scws_new() does not return an object.
     */
    private function segment($text)
    {
        $outText = array();

        // Create the segmenter.
        $so = scws_new();
        if (!is_object($so)) {
            // Fail with a clear message instead of a fatal
            // "call to a member function on a non-object" below.
            throw new RuntimeException('Unable to create the SCWS segmenter.');
        }

        // Character set of the input text.
        $so->set_charset('utf8');
        // Hand the text to the segmenter.
        $so->send_text($text);

        // Results arrive in batches; collect the word of every entry.
        while ($res = $so->get_result()) {
            foreach ($res as $v) {
                $outText[] = $v['word'];
            }
        }

        // Release the segmenter.
        $so->close();

        return $outText;
    }

}

本文转载自:http://blog.libnav.com/php/131.html