锘??xml version="1.0" encoding="utf-8" standalone="yes"?>久久人人爽人人人人片av,久久亚洲综合色一区二区三区,91久久精品电影http://m.shnenglu.com/jrckkyy/category/12532.html閲戣瀺鏁板,InformationSearch,Compiler,OS,zh-cnThu, 10 Dec 2009 15:07:47 GMTThu, 10 Dec 2009 15:07:47 GMT60鑷《鍚戜笅瀛︽悳绱㈠紩鎿庘斺斿寳澶уぉ緗戞悳绱㈠紩鎿嶵SE鍒嗘瀽鍙?qiáng)瀹屽叏娉ㄩ噴[6]鍊掓帓绱㈠紩鐨勫緩绔嬬殑紼嬪簭鍒嗘瀽(4)http://m.shnenglu.com/jrckkyy/archive/2009/12/10/102949.html瀛﹁呯珯鍦ㄥ法浜虹殑鑲╄唨涓?/dc:creator>瀛﹁呯珯鍦ㄥ法浜虹殑鑲╄唨涓?/author>Thu, 10 Dec 2009 15:03:00 GMThttp://m.shnenglu.com/jrckkyy/archive/2009/12/10/102949.htmlhttp://m.shnenglu.com/jrckkyy/comments/102949.htmlhttp://m.shnenglu.com/jrckkyy/archive/2009/12/10/102949.html#Feedback0http://m.shnenglu.com/jrckkyy/comments/commentRss/102949.htmlhttp://m.shnenglu.com/jrckkyy/services/trackbacks/102949.html浠ヤ笅鏄牴鎹鍚戠儲(chǔ)寮曞緩绔嬪掓帓绱㈠紩鐨勬敞閲?/p>

 

// CrtInvertedIdx: folds a term-sorted forward index into an inverted index.
//
// Usage: ./CrtInvertedIdx moon.fidx.sort > sun.iidx
//
// Input : one "term<TAB>docId" pair per line, with equal terms on adjacent lines.
// Output: one "term<TAB> docId docId ..." line per distinct term on stdout
//         (every doc id is preceded by a single space).
// Terms shorter than 2 bytes or longer than 8 bytes are dropped, as are lines
// that contain no TAB at all.
//
// Returns 0 on success, -1 if the input file is missing or cannot be opened.
int main(int argc, char* argv[])
{
    if (argc < 2)
    {
        std::cerr << "Usage: " << (argc > 0 ? argv[0] : "CrtInvertedIdx")
                  << " <sorted forward index>\n";
        return -1;
    }

    std::ifstream ifsImgInfo(argv[1]);
    if (!ifsImgInfo)
    {
        std::cerr << "Cannot open " << argv[1] << " for input\n";
        return -1;
    }

    std::string strLine;
    std::string strCurTerm;   // term whose posting list is being accumulated
    std::string strDocNum;    // accumulated " docId docId ..." for strCurTerm

    while (std::getline(ifsImgInfo, strLine))
    {
        const std::string::size_type idx = strLine.find('\t');
        if (idx == std::string::npos) continue;   // malformed line: no term/docId split

        const std::string strTerm = strLine.substr(0, idx);
        if (strTerm.size() < 2 || strTerm.size() > 8) continue;

        // A new term starts: flush the posting list of the previous one.
        if (!strCurTerm.empty() && strTerm != strCurTerm)
        {
            std::cout << strCurTerm << "\t" << strDocNum << '\n';
            strDocNum.clear();
        }

        strCurTerm = strTerm;
        strDocNum += ' ';
        strDocNum.append(strLine, idx + 1, std::string::npos);
    }

    // Flush the last term; nothing is printed when no valid line was read.
    if (!strCurTerm.empty())
        std::cout << strCurTerm << "\t" << strDocNum << '\n';

    return 0;
}

 

 



]]>
鑷《鍚戜笅瀛︽悳绱㈠紩鎿庘斺斿寳澶уぉ緗戞悳绱㈠紩鎿嶵SE鍒嗘瀽鍙?qiáng)瀹屽叏娉ㄩ噴[6]鍊掓帓绱㈠紩鐨勫緩绔嬬殑紼嬪簭鍒嗘瀽(2)http://m.shnenglu.com/jrckkyy/archive/2009/12/10/102947.html瀛﹁呯珯鍦ㄥ法浜虹殑鑲╄唨涓?/dc:creator>瀛﹁呯珯鍦ㄥ法浜虹殑鑲╄唨涓?/author>Thu, 10 Dec 2009 15:02:00 GMThttp://m.shnenglu.com/jrckkyy/archive/2009/12/10/102947.htmlhttp://m.shnenglu.com/jrckkyy/comments/102947.htmlhttp://m.shnenglu.com/jrckkyy/archive/2009/12/10/102947.html#Feedback0http://m.shnenglu.com/jrckkyy/comments/commentRss/102947.htmlhttp://m.shnenglu.com/jrckkyy/services/trackbacks/102947.html鍓嶉潰鐨凞ocIndex紼嬪簭杈撳叆涓涓猅ianwang.raw.*****鏂囦歡錛屼細(xì)浜х敓涓涓嬩笁涓枃浠?Doc.idx, Url.idx, DocId2Url.idx錛屾垜浠繖閲屽DocSegment紼嬪簭榪涜鍒嗘瀽銆?/p>

这里输入 Tianwang.raw.*****，Doc.idx，Url.idx.sort_uniq 等三个文件，输出一个 Tianwang.raw.***.seg 分词完毕的文件。

int main(int argc, char* argv[])
{
    string strLine, strFileName=argv[1];
    CUrl iUrl;
    vector<CUrl> vecCUrl;
    CDocument iDocument;
    vector<CDocument> vecCDocument;
    unsigned int docId = 0;

    //ifstream ifs("Tianwang.raw.2559638448");
    ifstream ifs(strFileName.c_str());  //DocSegment Tianwang.raw.****
    if (!ifs) 
    {
        cerr << "Cannot open tianwang.img.info for input\n";
        return -1;
    }

    ifstream ifsUrl("Url.idx.sort_uniq");   //鎺掑簭騫舵秷閲嶅悗鐨剈rl瀛楀吀
    if (!ifsUrl) 
    {
        cerr << "Cannot open Url.idx.sort_uniq for input\n";
        return -1;
    }
    ifstream ifsDoc("Doc.idx"); //瀛楀吀鏂囦歡
    if (!ifsDoc) 
    {
        cerr << "Cannot open Doc.idx for input\n";
        return -1;
    }

    while (getline(ifsUrl,strLine)) //鍋忕url瀛楀吀瀛樺叆涓涓悜閲忓唴瀛樹腑
    {
        char chksum[33];
        int  docid;

        memset(chksum, 0, 33);
        sscanf( strLine.c_str(), "%s%d", chksum, &docid );
        iUrl.m_sChecksum = chksum;
        iUrl.m_nDocId = docid;
        vecCUrl.push_back(iUrl);
    }

    while (getline(ifsDoc,strLine))     //鍋忕瀛楀吀鏂囦歡灝嗗叾鏀懼叆涓涓悜閲忓唴瀛樹腑
    {
        int docid,pos,length;
        char chksum[33];

        memset(chksum, 0, 33);
        sscanf( strLine.c_str(), "%d%d%d%s", &docid, &pos, &length,chksum );
        iDocument.m_nDocId = docid;
        iDocument.m_nPos = pos;
        iDocument.m_nLength = length;
        iDocument.m_sChecksum = chksum;
        vecCDocument.push_back(iDocument);
    }

 

    strFileName += ".seg";
    ofstream fout(strFileName.c_str(), ios::in|ios::out|ios::trunc|ios::binary);    //璁劇疆瀹屾垚鍒嗚瘝鍚庣殑鏁版嵁杈撳嚭鏂囦歡
    for ( docId=0; docId<MAX_DOC_ID; docId++ )
    {

        // find document according to docId
        int length = vecCDocument[docId+1].m_nPos - vecCDocument[docId].m_nPos -1;
        char *pContent = new char[length+1];
        memset(pContent, 0, length+1);
        ifs.seekg(vecCDocument[docId].m_nPos);
        ifs.read(pContent, length);

        char *s;
        s = pContent;

        // skip Head
        int bytesRead = 0,newlines = 0;
        while (newlines != 2 && bytesRead != HEADER_BUF_SIZE-1) 
        {
            if (*s == '\n')
                newlines++;
            else
                newlines = 0;
            s++;
            bytesRead++;
        }
        if (bytesRead == HEADER_BUF_SIZE-1) continue;


        // skip header
        bytesRead = 0,newlines = 0;
        while (newlines != 2 && bytesRead != HEADER_BUF_SIZE-1) 
        {
            if (*s == '\n')
                newlines++;
            else
                newlines = 0;
            s++;
            bytesRead++;
        }
        if (bytesRead == HEADER_BUF_SIZE-1) continue;

        //iDocument.m_sBody = s;
        iDocument.RemoveTags(s);    //鍘婚櫎<>
        iDocument.m_sBodyNoTags = s;

        delete[] pContent;
        string strLine = iDocument.m_sBodyNoTags;

        CStrFun::ReplaceStr(strLine, " ", " ");
        CStrFun::EmptyStr(strLine); // set " \t\r\n" to " "


        // segment the document 鍏蜂綋鍒嗚瘝澶勭悊
        CHzSeg iHzSeg;
        strLine = iHzSeg.SegmentSentenceMM(iDict,strLine);
        fout << docId << endl << strLine;
        fout << endl;
        
    }

    return(0);
}
榪欓噷鍙槸嫻厜鎺犲獎(jiǎng)寮忕殑榪囦竴閬嶅ぇ姒傜殑浠g爜錛屽悗闈㈡垜浼?xì)鏈変笓棰樿缁嗚瑙?parse html 鍜?segment docment 絳夋妧鏈?/p>

 

 



]]>
鑷《鍚戜笅瀛︽悳绱㈠紩鎿庘斺斿寳澶уぉ緗戞悳绱㈠紩鎿嶵SE鍒嗘瀽鍙?qiáng)瀹屽叏娉ㄩ噴[6]鍊掓帓绱㈠紩鐨勫緩绔嬬殑紼嬪簭鍒嗘瀽(3) http://m.shnenglu.com/jrckkyy/archive/2009/12/10/102948.html瀛﹁呯珯鍦ㄥ法浜虹殑鑲╄唨涓?/dc:creator>瀛﹁呯珯鍦ㄥ法浜虹殑鑲╄唨涓?/author>Thu, 10 Dec 2009 15:02:00 GMThttp://m.shnenglu.com/jrckkyy/archive/2009/12/10/102948.htmlhttp://m.shnenglu.com/jrckkyy/comments/102948.htmlhttp://m.shnenglu.com/jrckkyy/archive/2009/12/10/102948.html#Feedback0http://m.shnenglu.com/jrckkyy/comments/commentRss/102948.htmlhttp://m.shnenglu.com/jrckkyy/services/trackbacks/102948.html榪欓噷浠嬬粛姝e悜绱㈠紩鐨勫緩绔嬶紝濡傛灉鐩存帴寤虹珛鍊掓帓绱㈠紩鏁堢巼涓婂彲鑳戒細(xì)寰堜綆錛屾墍浠ュ彲浠ュ厛浜х敓姝e悜绱㈠紩涓哄悗闈㈢殑鍊掓帓绱㈠紩鎵撲笅鍩虹銆?/p>

 

详细的文件功能和介绍都在这里有了介绍：《自顶向下学搜索引擎——北大天网搜索引擎TSE分析及完全注释[5]倒排索引的建立及文件介绍》

 

CrtForwardIdx.cpp文件

 

int main(int argc, char* argv[])    //./CrtForwardIdx Tianwang.raw.***.seg > moon.fidx
{
    ifstream ifsImgInfo(argv[1]);
    if (!ifsImgInfo) 
    {
        cerr << "Cannot open " << argv[1] << " for input\n";
        return -1;
    }

    string strLine,strDocNum;
    int cnt = 0;
    while (getline(ifsImgInfo, strLine)) 
    {
        string::size_type idx;

        cnt++;
        if (cnt%2 == 1) //濂囨暟琛屼負(fù)鏂囨。緙栧彿
        {
            strDocNum = strLine.substr(0,strLine.size());
            continue;
        }
        if (strLine[0]=='\0' || strLine[0]=='#' || strLine[0]=='\n')
        {
            continue;
        }

        while ( (idx = strLine.find(SEPARATOR)) != string::npos ) //鎸囧畾鏌ユ壘鍒嗙晫絎?
        {
            string tmp1 = strLine.substr(0,idx);
            cout << tmp1 << "\t" << strDocNum << endl;
            strLine = strLine.substr(idx + SEPARATOR.size());
        }

        //if (cnt==100) break;
    }

    return 0;
}

 

author:http://hi.baidu.com/jrckkyy

author:http://blog.csdn.net/jrckkyy

 

 



]]>
鑷《鍚戜笅瀛︽悳绱㈠紩鎿庘斺斿寳澶уぉ緗戞悳绱㈠紩鎿嶵SE鍒嗘瀽鍙?qiáng)瀹屽叏娉ㄩ噴[6]鍊掓帓绱㈠紩鐨勫緩绔嬬殑紼嬪簭鍒嗘瀽(1)http://m.shnenglu.com/jrckkyy/archive/2009/12/10/102945.html瀛﹁呯珯鍦ㄥ法浜虹殑鑲╄唨涓?/dc:creator>瀛﹁呯珯鍦ㄥ法浜虹殑鑲╄唨涓?/author>Thu, 10 Dec 2009 15:00:00 GMThttp://m.shnenglu.com/jrckkyy/archive/2009/12/10/102945.htmlhttp://m.shnenglu.com/jrckkyy/comments/102945.htmlhttp://m.shnenglu.com/jrckkyy/archive/2009/12/10/102945.html#Feedback0http://m.shnenglu.com/jrckkyy/comments/commentRss/102945.htmlhttp://m.shnenglu.com/jrckkyy/services/trackbacks/102945.htmlauthor:http://hi.baidu.com/jrckkyy

author:http://blog.csdn.net/jrckkyy

上一篇主要介绍了倒排索引建立相关的文件及中间文件。
TSE建立索引在运行程序上的大致步骤可以简化分为以下几步：

1、运行命令 ./DocIndex
会用到一个文件 tianwang.raw.520    //爬取回来的原始文件，包含多个网页的所有信息，所以很大，这也是一个有待解决的问题，到底存成大文件（如果过大会超过2G或4G的限制，而且文件过大索引效率过低）还是小文件（文件数过多用于打开关闭文件句柄的消耗过大）还有待思考，还就是存储方案的解决最终肯定是要存为分布式的，最终总文件量肯定是会上TB的，TSE只支持小型的搜索引擎需求。
会产生以下三个文件 Doc.idx, Url.idx, DocId2Url.idx    //Data文件夹中的Doc.idx DocId2Url.idx和Doc.idx

2、运行命令 sort Url.idx|uniq > Url.idx.sort_uniq    //Data文件夹中的Url.idx.sort_uniq
会用到一个文件 Url.idx文件 //md5 hash 之后的url完整地址和document id值对
会产生一个文件 Url.idx.sort_uniq //URL消重，md5 hash排序，提高检索效率

3、运行命令 ./DocSegment Tianwang.raw.2559638448
会用到一个文件 Tianwang.raw.2559638448  //Tianwang.raw.2559638448为爬回来的文件，每个页面包含http头，分词为后面建立倒排索引做准备
会产生一个文件 Tianwang.raw.2559638448.seg //分词文件，由一行document id号和一行文档分词组（只对每个文档<html></html>中<head></head><body></body>等文字标记中的文本进行分组）构成

4、运行命令 ./CrtForwardIdx Tianwang.raw.2559638448.seg > moon.fidx //建立独立的正向索引

5、运行命令
#set | grep "LANG"
#LANG=en; export LANG;
#sort moon.fidx > moon.fidx.sort

6、运行命令 ./CrtInvertedIdx moon.fidx.sort > sun.iidx //建立倒排索引

我们先从建立索引的第一个程序DocIndex.cpp开始分析。(注释约定：Tianwang.raw.2559638448是抓回来合并成的大文件，后面就叫大文件，里面包含了很多篇html文档，里面的文档有规律的分隔就叫做一篇一篇的文档)


//DocIndex.h start-------------------------------------------------------------

 


#ifndef _COMM_H_040708_
#define _COMM_H_040708_

// NOTE(review): the header names on the #include lines below were lost when
// this file was extracted from a web page (only bare "#include" survived).
// They are restored from the standard-library names the accompanying sources
// use - confirm against the original header.
#include <algorithm>
#include <cstdio>
#include <cstring>
#include <fstream>
#include <iostream>
#include <iterator>
#include <map>
#include <set>
#include <string>
#include <vector>


using namespace std;

const unsigned HEADER_BUF_SIZE = 1024;
const unsigned RstPerPage = 20; // number of search results returned per page by the front end

//iceway
//const unsigned MAX_DOC_IDX_ID = 21312;  // needed by DocSegment.cpp
const unsigned MAX_DOC_IDX_ID = 22104;


//const string IMG_INFO_NAME("./Data/s1.1");
const string INF_INFO_NAME("./Data/sun.iidx"); // inverted index file
// Sample rows of the inverted index (term followed by document ids):
//   朱德    14383 16151 16151 16151 1683 207 6302 7889 8218 8218 8637
//   朱古力  1085 1222

// More than 90,000 entries; the term file includes special symbols,
// punctuation and Chinese characters.
const string DOC_IDX_NAME("./Data/Doc.idx"); // document index file
const string RAWPAGE_FILE_NAME("./Data/Tianwang.swu.iceway.1.0");

//iceway
const string DOC_FILE_NAME = "Tianwang.swu.iceway.1.0";  // needed by Docindex.cpp
const string Data_DOC_FILE_NAME = "./Data/Tianwang.swu.iceway.1.0";  // needed by Snapshot.cpp


//const string RM_THUMBNAIL_FILES("rm -f ~/public_html/ImgSE/timg/*");

//const string THUMBNAIL_DIR("/ImgSE/timg/");


#endif // _COMM_H_040708_
//DocIndex.h end--------------------------------------------------------------//DocIndex.cpp start-----------------------------------------------------------

#include
#include
#include "Md5.h"
#include "Url.h"
#include "Document.h"

//iceway(mnsc)
#include "Comm.h"
#include

using namespace std;

int main(int argc, char* argv[])
{
    //ifstream ifs("Tianwang.raw.2559638448");
 //ifstream ifs("Tianwang.raw.3023555472");
 //iceway(mnsc)
 ifstream ifs(DOC_FILE_NAME.c_str()); //鎵撳紑Tianwang.raw.3023555472鏂囦歡錛屾渶鍘熷鐨勬枃浠?br> if (!ifs)
 {
     cerr << "Cannot open " << "tianwang.img.info" << " for input\n";
     return -1;
    }
 ofstream ofsUrl("Url.idx", ios::in|ios::out|ios::trunc|ios::binary); //寤虹珛騫舵墦寮Url.idx鏂囦歡
 if( !ofsUrl )
 {
  cout << "error open file " << endl;
 }

 ofstream ofsDoc("Doc.idx", ios::in|ios::out|ios::trunc|ios::binary); //寤虹珛騫舵墦寮Doc.idx鏂囦歡
 if( !ofsDoc )
 {
  cout << "error open file " << endl;
 }

 ofstream ofsDocId2Url("DocId2Url.idx", ios::in|ios::out|ios::trunc|ios::binary); //寤虹珛騫舵墦寮DocId2Url.idx鏂囦歡
 if( !ofsDocId2Url )
 {
  cout << "error open file " << endl;
 }

 int cnt=0; //鏂囨。緙栧彿浠?寮濮嬭綆?br> string strLine,strPage;
 CUrl iUrl;
 CDocument iDocument;
 CMD5 iMD5;
 
 int nOffset = ifs.tellg();
 while (getline(ifs, strLine))
 {
  if (strLine[0]=='\0' || strLine[0]=='#' || strLine[0]=='\n')
  {
   nOffset = ifs.tellg();
   continue;
  }

  if (!strncmp(strLine.c_str(), "version: 1.0", 12)) //鍒ゆ柇絎竴琛屾槸鍚︽槸version: 1.0濡傛灉鏄氨瑙f瀽涓嬪幓
  { 
   if(!getline(ifs, strLine)) break;
   if (!strncmp(strLine.c_str(), "url: ", 4)) //鍒ゆ柇絎簩琛屾槸鍚︽槸url: 濡傛灉鏄垯瑙f瀽涓嬪幓
   {
    iUrl.m_sUrl = strLine.substr(5); //鎴彇url: 浜斾釜瀛楃涔嬪悗鐨剈rl鍐呭
    iMD5.GenerateMD5( (unsigned char*)iUrl.m_sUrl.c_str(), iUrl.m_sUrl.size() ); //瀵箄rl鐢╩d5 hash澶勭悊
    iUrl.m_sChecksum = iMD5.ToString(); //灝嗗瓧絎︽暟緇勭粍鍚堟垚瀛楃涓茶繖涓嚱鏁板湪Md5.h涓疄鐜?/p>

   } else
   {
    continue;
   }

   while (getline(ifs, strLine))
   {
    if (!strncmp(strLine.c_str(), "length: ", 8)) //涓鐩磋涓嬪幓鐩村埌鍒ゆ柇婢規(guī)竟(鐩稿絎簲琛?鎯烘瑺琚瘋帒ength: 鏄垯鎺ヤ笅涓嬪幓
    {
     sscanf(strLine.substr(8).c_str(), "%d", &(iDocument.m_nLength)); //灝嗚鍧楁墍浠h〃緗戦〉鐨勫疄闄呯綉欏靛唴瀹歸暱搴︽斁鍏Document鏁版嵁緇撴瀯涓?br>     break;
    }
   }

   getline(ifs, strLine); //璺寵繃鐩稿絎叚琛屾晠鎰忕暀鐨勪竴涓┖琛?/p>

   iDocument.m_nDocId = cnt; //灝嗘枃妗g紪鍙瘋祴鍊煎埌iDocument鏁版嵁緇撴瀯涓?br>   iDocument.m_nPos = nOffset; //鏂囨。緇撳熬鍦ㄥぇ鏂囦歡涓殑緇撴潫琛屽彿
   char *pContent = new char[iDocument.m_nLength+1]; //鏂板緩璇ユ枃妗i暱搴︾殑瀛楃涓叉寚閽?/p>

   memset(pContent, 0, iDocument.m_nLength+1); //姣忎竴浣嶅垵濮嬪寲涓?
   ifs.read(pContent, iDocument.m_nLength); //鏍規(guī)嵁鑾峰緱鐨勬枃妗i暱搴﹁鍙栨竟(鍏朵腑鍖呭惈鍗忚澶?璇誨彇鏂囨。鍐呭
   iMD5.GenerateMD5( (unsigned char*)pContent, iDocument.m_nLength );
   iDocument.m_sChecksum = iMD5.ToString(); //灝嗗瓧絎︽暟緇勭粍鍚堟垚瀛楃涓茶繖涓嚱鏁板湪Md5.h涓疄鐜?br>   
   delete[] pContent;
   
   ofsUrl << iUrl.m_sChecksum ; //灝唌d5hash鍚庣殑url鍐欏叆Url.idx鏂囦歡
   ofsUrl << "\t" << iDocument.m_nDocId << endl; //鍦ㄤ竴琛屼腑涓涓猼ab璺濈鍒嗛殧錛屽皢鏂囦歡緙栧彿鍐欏叆Url.idx鏂囦歡

   ofsDoc << iDocument.m_nDocId ; //灝嗘枃浠剁紪鍙峰啓鍏oc.idx鏂囦歡
   ofsDoc << "\t" << iDocument.m_nPos ; //鍦ㄤ竴琛屼腑涓涓猼ab璺濈鍒嗛殧錛屽皢璇ユ枃妗g粨鏉熻鍙鋒竟(鍚屾牱涔熸槸涓嬩竴鏂囨。寮濮嬭鍙?鍐欏叆Doc.idx鏂囦歡
   //ofsDoc << "\t" << iDocument.m_nLength ;
   ofsDoc << "\t" << iDocument.m_sChecksum << endl; //鍦ㄤ竴琛屼腑涓涓猼ab璺濈鍒嗛殧錛屽皢md5hash鍚庣殑url鍐欏叆Doc.idx鏂囦歡

   ofsDocId2Url << iDocument.m_nDocId ; //灝嗘枃浠剁紪鍙峰啓鍏ocId2Url.idx鏂囦歡
   ofsDocId2Url << "\t" << iUrl.m_sUrl << endl; //灝嗚鏂囨。鐨勫畬鏁磚rl鍐欏叆DocId2Url.idx鏂囦歡

   cnt++; //鏂囨。緙栧彿鍔犱竴璇存槑璇ヤ互鏂囨。鍒嗘瀽瀹屾瘯錛岀敓鎴愪笅涓鏂囨。鐨勭紪鍙?br>  }

  nOffset = ifs.tellg();

 }

 //鏈鍚庝竴琛屽彧鏈夋枃妗e彿鍜屼笂涓綃囨枃妗g粨鏉熷彿
 ofsDoc << cnt ;
 ofsDoc << "\t" << nOffset << endl;


 return(0);
}

//DocIndex.cpp end-----------------------------------------------------------author:http://hi.baidu.com/jrckkyy

author:http://blog.csdn.net/jrckkyy

 

 



]]>
鑷《鍚戜笅瀛︽悳绱㈠紩鎿庘斺斿寳澶уぉ緗戞悳绱㈠紩鎿嶵SE鍒嗘瀽鍙?qiáng)瀹屽叏娉ㄩ噴[5]鍊掓帓绱㈠紩鐨勫緩绔嬪強(qiáng)鏂囦歡浠嬬粛http://m.shnenglu.com/jrckkyy/archive/2009/12/10/102943.html瀛﹁呯珯鍦ㄥ法浜虹殑鑲╄唨涓?/dc:creator>瀛﹁呯珯鍦ㄥ法浜虹殑鑲╄唨涓?/author>Thu, 10 Dec 2009 14:55:00 GMThttp://m.shnenglu.com/jrckkyy/archive/2009/12/10/102943.htmlhttp://m.shnenglu.com/jrckkyy/comments/102943.htmlhttp://m.shnenglu.com/jrckkyy/archive/2009/12/10/102943.html#Feedback0http://m.shnenglu.com/jrckkyy/comments/commentRss/102943.htmlhttp://m.shnenglu.com/jrckkyy/services/trackbacks/102943.html涓嶅ソ鎰忔濊澶у涔呯瓑浜嗭紝鍓嶄竴闃典竴鐩村湪蹇欒冭瘯錛岀粓浜庣粨鏉熶簡銆傚懙鍛碉紒搴熻瘽涓嶅璇翠簡涓嬮潰鎴戜滑寮濮嬪惂錛?/p>

TSE用的是将抓取回来的网页文档全部装入一个大文档，然后对这一个大文档内的数据整体统一地建索引，其中包含了几个步骤。

view plaincopy to clipboardprint?
1.  The document index (Doc.idx) keeps information about each document.  
 
It is a fixed width ISAM (Index sequential access mode) index, orderd by docID.  
 
The information stored in each entry includes a pointer into the repository,  
 
a document length, a document checksum.  
 
 
 
//Doc.idx  鏂囨。緙栧彿 鏂囨。闀垮害    checksum hash鐮?nbsp; 
 
0   0   bc9ce846d7987c4534f53d423380ba70  
 
1   76760   4f47a3cad91f7d35f4bb6b2a638420e5  
 
2   141624  d019433008538f65329ae8e39b86026c  
 
3   142350  5705b8f58110f9ad61b1321c52605795  
 
//Doc.idx   end  
 
 
 
  The url index (url.idx) is used to convert URLs into docIDs.  
 
 
 
//url.idx  
 
5c36868a9c5117eadbda747cbdb0725f    0 
 
3272e136dd90263ee306a835c6c70d77    1 
 
6b8601bb3bb9ab80f868d549b5c5a5f3    2 
 
3f9eba99fa788954b5ff7f35a5db6e1f    3 
 
//url.idx   end  
 
 
 
It is a list of URL checksums with their corresponding docIDs and is sorted by  
 
checksum. In order to find the docID of a particular URL, the URL's checksum  
 
is computed and a binary search is performed on the checksums file to find its  
 
docID.  
 
 
 
    ./DocIndex  
 
        got Doc.idx, Url.idx, DocId2Url.idx //Data鏂囦歡澶逛腑鐨凞oc.idx DocId2Url.idx鍜孌oc.idx涓?nbsp; 
 
 
 
//DocId2Url.idx  
 
0   http://*.*.edu.cn/index.aspx  
 
1   http://*.*.edu.cn/showcontent1.jsp?NewsID=118  
 
2   http://*.*.edu.cn/0102.html  
 
3   http://*.*.edu.cn/0103.html  
 
//DocId2Url.idx end  
 
 
 
2.  sort Url.idx|uniq > Url.idx.sort_uniq    //Data鏂囦歡澶逛腑鐨刄rl.idx.sort_uniq  
 
 
 
//Url.idx.sort_uniq  
 
//瀵筯ash鍊艱繘琛屾帓搴?nbsp; 
 
000bfdfd8b2dedd926b58ba00d40986b    1111 
 
000c7e34b653b5135a2361c6818e48dc    1831 
 
0019d12f438eec910a06a606f570fde8    366 
 
0033f7c005ec776f67f496cd8bc4ae0d    2103 
 
 
 
3. Segment document to terms, (with finding document according to the url)  
 
    ./DocSegment Tianwang.raw.2559638448        //Tianwang.raw.2559638448涓虹埇鍥炴潵鐨勬枃浠?錛屾瘡涓〉闈㈠寘鍚玥ttp澶?nbsp; 
 
        got Tianwang.raw.2559638448.seg       
 
 
 
//Tianwang.raw.2559638448   鐖彇鐨勫師濮嬬綉欏墊枃浠跺湪鏂囨。鍐呴儴姣忎竴涓枃妗d箣闂村簲璇ユ槸閫氳繃version錛?lt;/html>鍜屽洖杞﹀仛鏍囧織浣嶅垎鍓茬殑  
 
version: 1.0 
 
url: http://***.105.138.175/Default2.asp?lang=gb  
 
origin: http://***.105.138.175/  
 
date: Fri, 23 May 2008 20:01:36 GMT  
 
ip: 162.105.138.175 
 
length: 38413 
 
 
 
HTTP/1.1 200 OK  
 
Server: Microsoft-IIS/5.0 
 
Date: Fri, 23 May 2008 11:17:49 GMT  
 
Connection: keep-alive  
 
Connection: Keep-Alive  
 
Content-Length: 38088 
 
Content-Type: text/html; Charset=gb2312  
 
Expires: Fri, 23 May 2008 11:17:49 GMT  
 
Set-Cookie: ASPSESSIONIDSSTRDCAB=IMEOMBIAIPDFCKPAEDJFHOIH; path=/  
 
Cache-control: private 
 
 
 
 
 
 
 
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" 
 
" 
<html>  
 
<head>  
 
<title>Apabi鏁板瓧璧勬簮騫沖彴</title>  
 
<meta http-equiv="Content-Type" content="text/html; charset=gb2312">  
 
<META NAME="ROBOTS" CONTENT="INDEX,NOFOLLOW">  
 
<META NAME="DESCRIPTION" CONTENT="鏁板瓧鍥句功棣?鏂規(guī)鏁板瓧鍥句功棣?鐢?shù)瀛愬浘涔?鐢?shù)瀛愪?ebook e涔?Apabi 鏁板瓧璧勬簮騫沖彴">  
 
<link rel="stylesheet" type="text/css" href="css\common.css">  
 
 
 
<style type="text/css">  
 
<!--  
 
.style4 {color: #666666}  
 
-->  
 
</style>  
 
 
 
<script LANGUAGE="vbscript">  
 
...  
 
</script>  
 
 
 
<Script Language="javascript">  
 
...  
 
</Script>  
 
</head>  
 
<body leftmargin="0" topmargin="0">  
 
</body>  
 
</html>  
 
//Tianwang.raw.2559638448   end  
 
 
 
//Tianwang.raw.2559638448.seg   灝嗘瘡涓〉闈㈠垎鎴愪竴琛屽涓?娉ㄦ剰涓棿娌℃湁鍥炶濺浣滀負(fù)鍒嗛殧)  
 

 
...  
 
...  
 
...  
 

 
...  
 
...  
 
...  
 
//Tianwang.raw.2559638448.seg   end  
 
 
 
//涓嬫槸 Tiny search 闈炲繀欏誨洜绱?nbsp; 
 
4. Create forward index (docic-->termid)     //寤虹珛姝e悜绱㈠紩  
 
    ./CrtForwardIdx Tianwang.raw.2559638448.seg > moon.fidx  
 
 
 
//Tianwang.raw.2559638448.seg 灝嗘瘡涓〉闈㈠垎鎴愪竴琛屽涓?lt;BR>//鍒嗚瘝   DocID<BR>1<BR>涓夋槦/  s/  鎵嬫満/  璁哄潧/  ,/  鎵嬫満/  閾冨0/  涓嬭澆/  ,/  鎵嬫満/  鍥劇墖/  涓嬭澆/  ,/  鎵嬫満/<BR>2<BR>...<BR>...<BR>... 

1.  The document index (Doc.idx) keeps information about each document.

It is a fixed width ISAM (Index sequential access mode) index, orderd by docID.

The information stored in each entry includes a pointer into the repository,

a document length, a document checksum.

 

//Doc.idx  鏂囨。緙栧彿 鏂囨。闀垮害 checksum hash鐮?/p>

0 0 bc9ce846d7987c4534f53d423380ba70

1 76760 4f47a3cad91f7d35f4bb6b2a638420e5

2 141624 d019433008538f65329ae8e39b86026c

3 142350 5705b8f58110f9ad61b1321c52605795

//Doc.idx end

 

  The url index (url.idx) is used to convert URLs into docIDs.

 

//url.idx

5c36868a9c5117eadbda747cbdb0725f 0

3272e136dd90263ee306a835c6c70d77 1

6b8601bb3bb9ab80f868d549b5c5a5f3 2

3f9eba99fa788954b5ff7f35a5db6e1f 3

//url.idx end

 

It is a list of URL checksums with their corresponding docIDs and is sorted by

checksum. In order to find the docID of a particular URL, the URL's checksum

is computed and a binary search is performed on the checksums file to find its

docID.

 

 ./DocIndex

  got Doc.idx, Url.idx, DocId2Url.idx //Data鏂囦歡澶逛腑鐨凞oc.idx DocId2Url.idx鍜孌oc.idx涓?/p>

 

//DocId2Url.idx

http://*.*.edu.cn/index.aspx

http://*.*.edu.cn/showcontent1.jsp?NewsID=118

http://*.*.edu.cn/0102.html

http://*.*.edu.cn/0103.html

//DocId2Url.idx end

 

2.  sort Url.idx|uniq > Url.idx.sort_uniq //Data鏂囦歡澶逛腑鐨刄rl.idx.sort_uniq

 

//Url.idx.sort_uniq

//瀵筯ash鍊艱繘琛屾帓搴?/p>

000bfdfd8b2dedd926b58ba00d40986b 1111

000c7e34b653b5135a2361c6818e48dc 1831

0019d12f438eec910a06a606f570fde8 366

0033f7c005ec776f67f496cd8bc4ae0d 2103

 

3. Segment document to terms, (with finding document according to the url)

 ./DocSegment Tianwang.raw.2559638448  //Tianwang.raw.2559638448涓虹埇鍥炴潵鐨勬枃浠?錛屾瘡涓〉闈㈠寘鍚玥ttp澶?/p>

  got Tianwang.raw.2559638448.seg  

 

//Tianwang.raw.2559638448 鐖彇鐨勫師濮嬬綉欏墊枃浠跺湪鏂囨。鍐呴儴姣忎竴涓枃妗d箣闂村簲璇ユ槸閫氳繃version錛?lt;/html>鍜屽洖杞﹀仛鏍囧織浣嶅垎鍓茬殑

version: 1.0

url: http://***.105.138.175/Default2.asp?lang=gb

origin: http://***.105.138.175/

date: Fri, 23 May 2008 20:01:36 GMT

ip: 162.105.138.175

length: 38413

 

HTTP/1.1 200 OK

Server: Microsoft-IIS/5.0

Date: Fri, 23 May 2008 11:17:49 GMT

Connection: keep-alive

Connection: Keep-Alive

Content-Length: 38088

Content-Type: text/html; Charset=gb2312

Expires: Fri, 23 May 2008 11:17:49 GMT

Set-Cookie: ASPSESSIONIDSSTRDCAB=IMEOMBIAIPDFCKPAEDJFHOIH; path=/

Cache-control: private

 

 

 

<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"

"

<html>

<head>

<title>Apabi鏁板瓧璧勬簮騫沖彴</title>

<meta http-equiv="Content-Type" content="text/html; charset=gb2312">

<META NAME="ROBOTS" CONTENT="INDEX,NOFOLLOW">

<META NAME="DESCRIPTION" CONTENT="鏁板瓧鍥句功棣?鏂規(guī)鏁板瓧鍥句功棣?鐢?shù)瀛愬浘涔?鐢?shù)瀛愪?ebook e涔?Apabi 鏁板瓧璧勬簮騫沖彴">

<link rel="stylesheet" type="text/css" href="css\common.css">

 

<style type="text/css">

<!--

.style4 {color: #666666}

-->

</style>

 

<script LANGUAGE="vbscript">

...

</script>

 

<Script Language="javascript">

...

</Script>

</head>

<body leftmargin="0" topmargin="0">

</body>

</html>

//Tianwang.raw.2559638448 end

 

//Tianwang.raw.2559638448.seg 灝嗘瘡涓〉闈㈠垎鎴愪竴琛屽涓?娉ㄦ剰涓棿娌℃湁鍥炶濺浣滀負(fù)鍒嗛殧)

1

...

...

...

2

...

...

...

//Tianwang.raw.2559638448.seg end

 

//涓嬫槸 Tiny search 闈炲繀欏誨洜绱?/p>

4. Create forward index (docic-->termid)  //寤虹珛姝e悜绱㈠紩

 ./CrtForwardIdx Tianwang.raw.2559638448.seg > moon.fidx

 

//Tianwang.raw.2559638448.seg 灝嗘瘡涓〉闈㈠垎鎴愪竴琛屽涓?/鍒嗚瘝   DocID1涓夋槦/  s/  鎵嬫満/  璁哄潧/  ,/  鎵嬫満/  閾冨0/  涓嬭澆/  ,/  鎵嬫満/  鍥劇墖/  涓嬭澆/  ,/  鎵嬫満/2.........view plaincopy to clipboardprint?
//Tianwang.raw.2559638448.seg end  
 
 
//moon.fidx  
 
//姣忕瘒鏂囨。鍙峰搴旀枃妗e唴鍒嗗嚭鏉ョ殑    鍒嗚瘝  DocID  
 
閮戒細(xì)  2391 
 
浣?nbsp;  2391 
 
閭d簺  2391 
 
鎷ユ湁  2391 
 
瀹?nbsp;  2391 
 
鐨?nbsp;  2391 
 
浜?nbsp;  2391 
 
鐨?nbsp;  2391 
 
瑙嗛噹  2391 
 
鍙?nbsp;  2391 
 
紿?nbsp;  2391 
 
鍦?nbsp;  2180 
 
鐮旂┒鐢熼儴    2180 
 
涓婚〉  2180 
 
鍩瑰吇  2180 
 
綆$悊  2180 
 
鏍忕洰  2180 
 
涓嬭澆  2180 
 
錛?nbsp;  2180 
 
銆?nbsp;  2180 
 
鍏充簬  2180 
 
鍋氬ソ  2180 
 
騫?nbsp;  2180 
 
鍥藉  2180 
 
鍏淳  2180 
 
鐮旂┒鐢?2180 
 
欏圭洰  2180 
 
//moon.fidx end  
 
 
 
5.# set | grep "LANG" 
 
LANG=en; export LANG;  
 
sort moon.fidx > moon.fidx.sort  
 
 
 
6. Create inverted index (termid-->docid)    //寤虹珛鍊掓帓绱㈠紩  
 
    ./CrtInvertedIdx moon.fidx.sort > sun.iidx  
 
 
 
//sun.iidx  //鏂囦歡瑙勬ā澶ф鍑忓皯1/2  
 
鑺卞伐   236 
 
鑺辨搗   2103 
 
鑺卞崏   1018 1061 1061 1061 1730 1730 1730 1730 1730 1852 949 949 
 
鑺辮暰   447 447 
 
鑺辨湪   1061 
 
鑺卞憿   1430 
 
鑺辨湡   447 447 447 447 447 525 
 
鑺遍挶   174 236 
 
鑺辮壊   1730 1730 
 
鑺辮壊鍝佺     1660 
 
鑺辯敓   450 526 
 
鑺卞紡   1428 1430 1430 1430 
 
鑺辯汗   1430 1430 
 
鑺卞簭   447 447 447 447 447 450 
 
鑺辯誕   136 137 
 
鑺辮娊   450 450 
 
//sun.iidx  end  
 
 
 
TSESearch   CGI program for query  
 
Snapshot    CGI program for page snapshot  
 
 
<P>  
author:http://hi.baidu.com/jrckkyy  
 
author:http://blog.csdn.net/jrckkyy  
</P> 

 



]]>
鑷《鍚戜笅瀛︽悳绱㈠紩鎿庘斺斿寳澶уぉ緗戞悳绱㈠紩鎿嶵SE鍒嗘瀽鍙?qiáng)瀹屽叏娉ㄩ噴[4]灝忕粨http://m.shnenglu.com/jrckkyy/archive/2009/12/10/102942.html瀛﹁呯珯鍦ㄥ法浜虹殑鑲╄唨涓?/dc:creator>瀛﹁呯珯鍦ㄥ法浜虹殑鑲╄唨涓?/author>Thu, 10 Dec 2009 14:54:00 GMThttp://m.shnenglu.com/jrckkyy/archive/2009/12/10/102942.htmlhttp://m.shnenglu.com/jrckkyy/comments/102942.htmlhttp://m.shnenglu.com/jrckkyy/archive/2009/12/10/102942.html#Feedback0http://m.shnenglu.com/jrckkyy/comments/commentRss/102942.htmlhttp://m.shnenglu.com/jrckkyy/services/trackbacks/102942.html閫氳繃鍓嶉潰鐨勪笁綃囨枃绔犵浉淇′綘宸茬粡瀵圭縐樼殑鎼滅儲(chǔ)寮曟搸鏈変簡涓涓劅鎬х殑璁よ瘑錛屽拰鏅氱殑php綾諱技鐨勮剼鏈璦鏈嶅姟鍣ㄧ被浼鹼紝閫氳繃鑾峰彇鍓嶅彴鍏抽敭瀛楋紝閫氳繃瀛楀吀鍒嗚瘝錛屽拰浜嬪厛寤虹珛寤虹珛濂界殑鍊掓帓绱㈠紩榪涜鐩稿叧鎬у垎鏋愶紝寰楀嚭鏌ヨ緇撴瀯鏍煎紡鍖栬緭鍑虹粨鏋溿傝岃繖閲岀殑鎶鏈毦鐐瑰湪浜?/p>

1、字典的选取（事实上根据不同时代不同地方人们的语言习惯是不一样的，所以说字典的最小元的取值是不同的）

2、倒排索引的建立（这里就要涉及到爬虫的抓取和索引的建立，后面将重点介绍这2点，搜索引擎的效率和服务质量实效性瓶颈在这里）

3、相关性分析（对抓回来的文档分词建索引和用户关键字分词算法上要对应）

后面文章会重点介绍爬虫的抓取和索引的建立。

]]>
鑷《鍚戜笅瀛︽悳绱㈠紩鎿庘斺斿寳澶уぉ緗戞悳绱㈠紩鎿嶵SE鍒嗘瀽鍙?qiáng)瀹屽叏娉ㄩ噴[3]鏉ュ埌鍏抽敭瀛楀垎璇嶅強(qiáng)鐩稿叧鎬у垎鏋愮▼搴?http://m.shnenglu.com/jrckkyy/archive/2009/12/10/102941.html瀛﹁呯珯鍦ㄥ法浜虹殑鑲╄唨涓?/dc:creator>瀛﹁呯珯鍦ㄥ法浜虹殑鑲╄唨涓?/author>Thu, 10 Dec 2009 14:53:00 GMThttp://m.shnenglu.com/jrckkyy/archive/2009/12/10/102941.htmlhttp://m.shnenglu.com/jrckkyy/comments/102941.htmlhttp://m.shnenglu.com/jrckkyy/archive/2009/12/10/102941.html#Feedback0http://m.shnenglu.com/jrckkyy/comments/commentRss/102941.htmlhttp://m.shnenglu.com/jrckkyy/services/trackbacks/102941.html鏈夊墠闈㈡敞閲婃垜浠彲浠ョ煡閬撴煡璇㈠叧閿瓧鍜屽瓧鍏告枃浠跺噯澶囧ソ濂藉悗錛屽皢榪涘叆鐢ㄦ埛鍏抽敭瀛楀垎璇嶉樁孌?/p>

//TSESearch.cpp涓細(xì)

view plaincopy to clipboardprint?
CHzSeg iHzSeg;      //include ChSeg/HzSeg.h  
 
//  
iQuery.m_sSegQuery = iHzSeg.SegmentSentenceMM(iDict, iQuery.m_sQuery);  //灝唃et鍒扮殑鏌ヨ鍙橀噺鍒嗚瘝鍒嗘垚 "鎴?        鐖?      浣犱滑/ 鐨?      鏍煎紡"  
 
vector<STRING></STRING> vecTerm;  
iQuery.ParseQuery(vecTerm);     //灝嗕互"/"鍒掑垎寮鐨勫叧閿瓧涓涓欏哄簭鏀懼叆涓涓悜閲忓鍣ㄤ腑  
 
set<STRING></STRING> setRelevantRst;   
iQuery.GetRelevantRst(vecTerm, mapBuckets, setRelevantRst);   
 
gettimeofday(&end_tv,&tz);  
// search end  
//鎼滅儲(chǔ)瀹屾瘯 

 CHzSeg iHzSeg;  //include ChSeg/HzSeg.h

 //
 iQuery.m_sSegQuery = iHzSeg.SegmentSentenceMM(iDict, iQuery.m_sQuery); //灝唃et鍒扮殑鏌ヨ鍙橀噺鍒嗚瘝鍒嗘垚 "鎴?  鐖?  浣犱滑/ 鐨?  鏍煎紡"
 
 vector vecTerm;
 iQuery.ParseQuery(vecTerm);  //灝嗕互"/"鍒掑垎寮鐨勫叧閿瓧涓涓欏哄簭鏀懼叆涓涓悜閲忓鍣ㄤ腑
 
 set setRelevantRst;
 iQuery.GetRelevantRst(vecTerm, mapBuckets, setRelevantRst);
 
 gettimeofday(&end_tv,&tz);
 // search end
 //鎼滅儲(chǔ)瀹屾瘯view plaincopy to clipboardprint?
鐪婥HzSeg 涓殑榪欎釜鏂規(guī)硶 

鐪婥HzSeg 涓殑榪欎釜鏂規(guī)硶view plaincopy to clipboardprint?
//ChSeg/HzSeg.h 

//ChSeg/HzSeg.hview plaincopy to clipboardprint?
/**  
 * 紼嬪簭緲昏瘧璇存槑  
 * 榪涗竴姝ュ噣鍖栨暟鎹紝杞崲姹夊瓧  
 * @access  public  
 * @param   CDict, string 鍙傛暟鐨勬眽瀛楄鏄?瀛楀吀錛屾煡璇㈠瓧絎︿覆  
 * @return  string 0  
 */  
// process a sentence before segmentation  
//鍦ㄥ垎璇嶅墠澶勭悊鍙ュ瓙  
string CHzSeg::SegmentSentenceMM (CDict &dict, string s1) const  
{  
    string s2="";  
    unsigned int i,len;  
 
    while (!s1.empty())   
    {  
        unsigned char ch=(unsigned char) s1[0];  
        if(ch<128)   
        { // deal with ASCII  
            i=1;  
            len = s1.size();  
            while (i<LEN len="s1.length();" i="0;" 涓枃鏍囩偣絳夐潪姹夊瓧瀛楃="" if="" else="" yhf="" s1="s1.substr(i);" by="" added="" ch="=13)" s2="" cr=""></LEN>=161)  
              && (!((unsigned char)s1[i]==161 && ((unsigned char)s1[i+1]>=162 && (unsigned char)s1[i+1]<=168)))  
              && (!((unsigned char)s1[i]==161 && ((unsigned char)s1[i+1]>=171 && (unsigned char)s1[i+1]<=191)))  
              && (!((unsigned char)s1[i]==163 && ((unsigned char)s1[i+1]==172 || (unsigned char)s1[i+1]==161)   
              || (unsigned char)s1[i+1]==168 || (unsigned char)s1[i+1]==169 || (unsigned char)s1[i+1]==186  
              || (unsigned char)s1[i+1]==187 || (unsigned char)s1[i+1]==191)))   
                {   
                    ii=i+2; // 鍋囧畾娌℃湁鍗婁釜姹夊瓧  
                }  
 
                if (i==0) ii=i+2;  
 
                // 涓嶅鐞嗕腑鏂囩┖鏍?nbsp; 
                if (!(ch==161 && (unsigned char)s1[1]==161))   
                {   
                    if (i <= s1.size())  // yhf  
                        // 鍏朵粬鐨勯潪姹夊瓧鍙屽瓧鑺傚瓧絎﹀彲鑳借繛緇緭鍑?nbsp; 
                        s2 += s1.substr(0, i) + SEPARATOR;   
                    else break; // yhf  
                }  
 
                if (i <= s1.size())  // yhf  
                    s1s1=s1.substr(i);  
                else break;     //yhf  
 
                continue;  
            }  
        }  
      
 
    // 浠ヤ笅澶勭悊姹夊瓧涓?nbsp; 
 
        i = 2;  
        len = s1.length();  
 
        while(i<LEN></LEN>=176)   
//    while(i<LEN></LEN>=128 && (unsigned char)s1[i]!=161)  
            i+=2;  
 
        s2+=SegmentHzStrMM(dict, s1.substr(0,i));  
 
        if (i <= len)    // yhf  
            s1s1=s1.substr(i);  
        else break; // yhf  
    }  
 
    return s2;  

/**
 * 紼嬪簭緲昏瘧璇存槑
 * 榪涗竴姝ュ噣鍖栨暟鎹紝杞崲姹夊瓧
 * @access  public
 * @param   CDict, string 鍙傛暟鐨勬眽瀛楄鏄?瀛楀吀錛屾煡璇㈠瓧絎︿覆
 * @return  string 0
 */
// process a sentence before segmentation
//鍦ㄥ垎璇嶅墠澶勭悊鍙ュ瓙
string CHzSeg::SegmentSentenceMM (CDict &dict, string s1) const
{
 string s2="";
 unsigned int i,len;

 while (!s1.empty())
 {
  unsigned char ch=(unsigned char) s1[0];
  if(ch<128)
  { // deal with ASCII
   i=1;
   len = s1.size();
   while (i=161)
              && (!((unsigned char)s1[i]==161 && ((unsigned char)s1[i+1]>=162 && (unsigned char)s1[i+1]<=168)))
              && (!((unsigned char)s1[i]==161 && ((unsigned char)s1[i+1]>=171 && (unsigned char)s1[i+1]<=191)))
              && (!((unsigned char)s1[i]==163 && ((unsigned char)s1[i+1]==172 || (unsigned char)s1[i+1]==161)
              || (unsigned char)s1[i+1]==168 || (unsigned char)s1[i+1]==169 || (unsigned char)s1[i+1]==186
              || (unsigned char)s1[i+1]==187 || (unsigned char)s1[i+1]==191)))
    {
     i=i+2; // 鍋囧畾娌℃湁鍗婁釜姹夊瓧
    }

    if (i==0) i=i+2;

    // 涓嶅鐞嗕腑鏂囩┖鏍?br>    if (!(ch==161 && (unsigned char)s1[1]==161))
    {
     if (i <= s1.size()) // yhf
      // 鍏朵粬鐨勯潪姹夊瓧鍙屽瓧鑺傚瓧絎﹀彲鑳借繛緇緭鍑?br>      s2 += s1.substr(0, i) + SEPARATOR;
     else break; // yhf
    }

    if (i <= s1.size()) // yhf
     s1=s1.substr(i);
    else break;  //yhf

    continue;
   }
  }
   

    // 浠ヤ笅澶勭悊姹夊瓧涓?/p>

  i = 2;
  len = s1.length();

  while(i=176)
//    while(i=128 && (unsigned char)s1[i]!=161)
   i+=2;

  s2+=SegmentHzStrMM(dict, s1.substr(0,i));

  if (i <= len) // yhf
   s1=s1.substr(i);
  else break; // yhf
 }

 return s2;
}view plaincopy to clipboardprint?
  

 view plaincopy to clipboardprint?
//Query.cpp 

//Query.cppview plaincopy to clipboardprint?
<PRE class=csharp name="code">/**  
 * 紼嬪簭緲昏瘧璇存槑  
 * 灝嗕互"/"鍒掑垎寮鐨勫叧閿瓧涓涓欏哄簭鏀懼叆涓涓悜閲忓鍣ㄤ腑  
 *  
 * @access  public  
 * @param   vector<STRING></STRING> 鍙傛暟鐨勬眽瀛楄鏄庯細(xì)鍚戦噺瀹瑰櫒  
 * @return  void  
 */  
void CQuery::ParseQuery(vector<STRING></STRING> &vecTerm)  
{  
    string::size_type idx;   
    while ( (idx = m_sSegQuery.find("/  ")) != string::npos ) {   
        vecTerm.push_back(m_sSegQuery.substr(0,idx));   
        m_sSegQuerym_sSegQuery = m_sSegQuery.substr(idx+3);   
    }  
}  
</PRE> 
<PRE class=csharp name="code"> </PRE> 
<PRE class=csharp name="code"><PRE class=csharp name="code">/**  
 * 紼嬪簭緲昏瘧璇存槑  
 * 鐩稿叧鎬у垎鏋愭煡璇紝鏋勯犵粨鏋滈泦鍚坰etRelevantRst //鐡墮鎵鍦?nbsp; 
 *  
 * @access  public  
 * @param   vector<STRING></STRING> map set<STRING></STRING> 鍙傛暟鐨勬眽瀛楄鏄庯細(xì) 鐢ㄦ埛鎻愪氦鍏抽敭瀛楃殑鍒嗚瘝緇勶紝鍊掓帓绱㈠紩鏄犲皠錛岀浉鍏蟲х粨鏋滈泦鍚?nbsp; 
 * @return  string 0  
 */  
bool CQuery::GetRelevantRst  
(  
    vector<STRING></STRING> &vecTerm,   
    map &mapBuckets,   
    set<STRING></STRING> &setRelevantRst  
) const  
{  
    set<STRING></STRING> setSRst;  
 
    bool bFirst=true;  
    vector<STRING></STRING>::iterator itTerm = vecTerm.begin();  
 
    for ( ; itTerm != vecTerm.end(); ++itTerm )  
    {  
 
        setSRst.clear();  
        copy(setRelevantRst.begin(), setRelevantRst.end(), inserter(setSRst,setSRst.begin()));  
 
        map mapRstDoc;  
        string docid;  
        int doccnt;  
 
        map::iterator itBuckets = mapBuckets.find(*itTerm);  
        if (itBuckets != mapBuckets.end())  
        {  
            string strBucket = (*itBuckets).second;  
            string::size_type idx;  
            idx = strBucket.find_first_not_of(" ");  
            strBucketstrBucket = strBucket.substr(idx);  
 
            while ( (idx = strBucket.find(" ")) != string::npos )   
            {  
                docid = strBucket.substr(0,idx);  
                doccnt = 0;  
 
                if (docid.empty()) continue;  
 
                map::iterator it = mapRstDoc.find(docid);  
                if ( it != mapRstDoc.end() )  
                {  
                    doccnt = (*it).second + 1;  
                    mapRstDoc.erase(it);  
                }  
                mapRstDoc.insert( pair(docid,doccnt) );  
 
                strBucketstrBucket = strBucket.substr(idx+1);  
            }  
 
            // remember the last one  
            docid = strBucket;  
            doccnt = 0;  
            map::iterator it = mapRstDoc.find(docid);  
            if ( it != mapRstDoc.end() )  
            {  
                doccnt = (*it).second + 1;  
                mapRstDoc.erase(it);  
            }  
            mapRstDoc.insert( pair(docid,doccnt) );  
        }  
 
        // sort by term frequencty  
        multimap > newRstDoc;  
        map::iterator it0 = mapRstDoc.begin();  
        for ( ; it0 != mapRstDoc.end(); ++it0 ){  
            newRstDoc.insert( pair((*it0).second,(*it0).first) );  
        }  
 
        multimap::iterator itNewRstDoc = newRstDoc.begin();  
        setRelevantRst.clear();  
        for ( ; itNewRstDoc != newRstDoc.end(); ++itNewRstDoc ){  
            string docid = (*itNewRstDoc).second;  
 
            if (bFirst==true) {  
                setRelevantRst.insert(docid);  
                continue;  
            }  
 
            if ( setSRst.find(docid) != setSRst.end() ){      
                setRelevantRst.insert(docid);  
            }  
        }  
 
        //cout << "setRelevantRst.size(): " << setRelevantRst.size() << "<BR>";  
        bFirst = false;  
    }  
    return true;  
}</PRE> 
</PRE> 
鎺ヤ笅鏉ョ殑灝辨槸鐜板疄浜嗭紝鍓嶉潰閮藉彧鏄鐞嗘暟鎹緱鍒?setRelevantRst 榪欎釜鏌ヨ緇撴瀯闆嗗悎,榪欓噷灝變笉澶氳浜嗕笅闈㈠氨鍜宲hp涔嬬被鐨勮剼鏈璦宸笉澶氾紝鏍煎紡鍖栫粨鏋滈泦鍚堝茍鏄劇ず鍑烘潵銆?nbsp;

view plaincopy to clipboardprint?/**   * 紼嬪簭緲昏瘧璇存槑   * 灝嗕互"/"鍒掑垎寮鐨勫叧閿瓧涓涓欏哄簭鏀懼叆涓涓悜閲忓鍣ㄤ腑   *   * @access  public   * @param   vector<STRING></STRING> 鍙傛暟鐨勬眽瀛楄鏄庯細(xì)鍚戦噺瀹瑰櫒   * @return  void   */  void CQuery::ParseQuery(vector<STRING></STRING> &vecTerm)   {       string::size_type idx;        while ( (idx = m_sSegQuery.find("/  ")) != string::npos ) {            vecTerm.push_back(m_sSegQuery.substr(0,idx));            m_sSegQuery = m_sSegQuery.substr(idx+3);        }   }  /**
 * 紼嬪簭緲昏瘧璇存槑
 * 灝嗕互"/"鍒掑垎寮鐨勫叧閿瓧涓涓欏哄簭鏀懼叆涓涓悜閲忓鍣ㄤ腑
 *
 * @access  public
 * @param   vector 鍙傛暟鐨勬眽瀛楄鏄庯細(xì)鍚戦噺瀹瑰櫒
 * @return  void
 */
void CQuery::ParseQuery(vector &vecTerm)
{
 string::size_type idx;
 while ( (idx = m_sSegQuery.find("/  ")) != string::npos ) {
  vecTerm.push_back(m_sSegQuery.substr(0,idx));
  m_sSegQuery = m_sSegQuery.substr(idx+3);
 }
}

view plaincopy to clipboardprint?   
view plaincopy to clipboardprint?<PRE class=csharp name="code">/**   * 紼嬪簭緲昏瘧璇存槑   * 鐩稿叧鎬у垎鏋愭煡璇紝鏋勯犵粨鏋滈泦鍚坰etRelevantRst //鐡墮鎵鍦?nbsp;  *   * @access  public   * @param   vector<STRING></STRING> map set<STRING></STRING> 鍙傛暟鐨勬眽瀛楄鏄庯細(xì) 鐢ㄦ埛鎻愪氦鍏抽敭瀛楃殑鍒嗚瘝緇勶紝鍊掓帓绱㈠紩鏄犲皠錛岀浉鍏蟲х粨鏋滈泦鍚?nbsp;  * @return  string 0   */  bool CQuery::GetRelevantRst   (       vector<STRING></STRING> &vecTerm,        map &mapBuckets,        set<STRING></STRING> &setRelevantRst   ) const  {       set<STRING></STRING> setSRst;         bool bFirst=true;       vector<STRING></STRING>::iterator itTerm = vecTerm.begin();         for ( ; itTerm != vecTerm.end(); ++itTerm )       {             setSRst.clear();           copy(setRelevantRst.begin(), setRelevantRst.end(), inserter(setSRst,setSRst.begin()));             map mapRstDoc;           string docid;           int doccnt;             map::iterator itBuckets = mapBuckets.find(*itTerm);           if (itBuckets != mapBuckets.end())           {               string strBucket = (*itBuckets).second;               string::size_type idx;               idx = strBucket.find_first_not_of(" ");               strBucket = strBucket.substr(idx);                 while ( (idx = strBucket.find(" ")) != string::npos )                {                   docid = strBucket.substr(0,idx);                   doccnt = 0;                     if (docid.empty()) continue;                     map::iterator it = mapRstDoc.find(docid);                   if ( it != mapRstDoc.end() )                   {                       doccnt = (*it).second + 1;                       mapRstDoc.erase(it);                   }                   mapRstDoc.insert( pair(docid,doccnt) );                     strBucket = strBucket.substr(idx+1);               }                 // remember the last one               docid = strBucket;               doccnt = 0;               map::iterator it = mapRstDoc.find(docid);               if ( it != mapRstDoc.end() )               {                   doccnt = 
(*it).second + 1;                   mapRstDoc.erase(it);               }               mapRstDoc.insert( pair(docid,doccnt) );           }             // sort by term frequencty           multimap > newRstDoc;           map::iterator it0 = mapRstDoc.begin();           for ( ; it0 != mapRstDoc.end(); ++it0 ){               newRstDoc.insert( pair((*it0).second,(*it0).first) );           }             multimap::iterator itNewRstDoc = newRstDoc.begin();           setRelevantRst.clear();           for ( ; itNewRstDoc != newRstDoc.end(); ++itNewRstDoc ){               string docid = (*itNewRstDoc).second;                 if (bFirst==true) {                   setRelevantRst.insert(docid);                   continue;               }                 if ( setSRst.find(docid) != setSRst.end() ){                       setRelevantRst.insert(docid);               }           }             //cout << "setRelevantRst.size(): " << setRelevantRst.size() << "<BR>";           bFirst = false;       }       return true;   }</PRE>  view plaincopy to clipboardprint?/**   * 紼嬪簭緲昏瘧璇存槑   * 鐩稿叧鎬у垎鏋愭煡璇紝鏋勯犵粨鏋滈泦鍚坰etRelevantRst //鐡墮鎵鍦?nbsp;  *   * @access  public   * @param   vector<STRING></STRING> map set<STRING></STRING> 鍙傛暟鐨勬眽瀛楄鏄庯細(xì) 鐢ㄦ埛鎻愪氦鍏抽敭瀛楃殑鍒嗚瘝緇勶紝鍊掓帓绱㈠紩鏄犲皠錛岀浉鍏蟲х粨鏋滈泦鍚?nbsp;  * @return  string 0   */  bool CQuery::GetRelevantRst   (       vector<STRING></STRING> &vecTerm,        map &mapBuckets,        set<STRING></STRING> &setRelevantRst   ) const  {       set<STRING></STRING> setSRst;         bool bFirst=true;       vector<STRING></STRING>::iterator itTerm = vecTerm.begin();         for ( ; itTerm != vecTerm.end(); ++itTerm )       {             setSRst.clear();           copy(setRelevantRst.begin(), setRelevantRst.end(), inserter(setSRst,setSRst.begin()));             map mapRstDoc;           string docid;           int doccnt;             map::iterator itBuckets = mapBuckets.find(*itTerm);           if (itBuckets != mapBuckets.end())           {               string strBucket = 
(*itBuckets).second;               string::size_type idx;               idx = strBucket.find_first_not_of(" ");               strBucket = strBucket.substr(idx);                 while ( (idx = strBucket.find(" ")) != string::npos )                {                   docid = strBucket.substr(0,idx);                   doccnt = 0;                     if (docid.empty()) continue;                     map::iterator it = mapRstDoc.find(docid);                   if ( it != mapRstDoc.end() )                   {                       doccnt = (*it).second + 1;                       mapRstDoc.erase(it);                   }                   mapRstDoc.insert( pair(docid,doccnt) );                     strBucket = strBucket.substr(idx+1);               }                 // remember the last one               docid = strBucket;               doccnt = 0;               map::iterator it = mapRstDoc.find(docid);               if ( it != mapRstDoc.end() )               {                   doccnt = (*it).second + 1;                   mapRstDoc.erase(it);               }               mapRstDoc.insert( pair(docid,doccnt) );           }             // sort by term frequencty           multimap > newRstDoc;           map::iterator it0 = mapRstDoc.begin();           for ( ; it0 != mapRstDoc.end(); ++it0 ){               newRstDoc.insert( pair((*it0).second,(*it0).first) );           }             multimap::iterator itNewRstDoc = newRstDoc.begin();           setRelevantRst.clear();           for ( ; itNewRstDoc != newRstDoc.end(); ++itNewRstDoc ){               string docid = (*itNewRstDoc).second;                 if (bFirst==true) {                   setRelevantRst.insert(docid);                   continue;               }                 if ( setSRst.find(docid) != setSRst.end() ){                       setRelevantRst.insert(docid);               }           }             //cout << "setRelevantRst.size(): " << setRelevantRst.size() << "<BR>";           bFirst = false;       }       return 
true;   }  /**
 * 紼嬪簭緲昏瘧璇存槑
 * 鐩稿叧鎬у垎鏋愭煡璇紝鏋勯犵粨鏋滈泦鍚坰etRelevantRst //鐡墮鎵鍦?br> *
 * @access  public
 * @param   vector map set 鍙傛暟鐨勬眽瀛楄鏄庯細(xì) 鐢ㄦ埛鎻愪氦鍏抽敭瀛楃殑鍒嗚瘝緇勶紝鍊掓帓绱㈠紩鏄犲皠錛岀浉鍏蟲х粨鏋滈泦鍚?br> * @return  string 0
 */
bool CQuery::GetRelevantRst
(
 vector &vecTerm,
 map &mapBuckets,
 set &setRelevantRst
) const
{
 set setSRst;

 bool bFirst=true;
 vector::iterator itTerm = vecTerm.begin();

 for ( ; itTerm != vecTerm.end(); ++itTerm )
 {

  setSRst.clear();
  copy(setRelevantRst.begin(), setRelevantRst.end(), inserter(setSRst,setSRst.begin()));

  map mapRstDoc;
  string docid;
  int doccnt;

  map::iterator itBuckets = mapBuckets.find(*itTerm);
  if (itBuckets != mapBuckets.end())
  {
   string strBucket = (*itBuckets).second;
   string::size_type idx;
   idx = strBucket.find_first_not_of(" ");
   strBucket = strBucket.substr(idx);

   while ( (idx = strBucket.find(" ")) != string::npos )
   {
    docid = strBucket.substr(0,idx);
    doccnt = 0;

    if (docid.empty()) continue;

    map::iterator it = mapRstDoc.find(docid);
    if ( it != mapRstDoc.end() )
    {
     doccnt = (*it).second + 1;
     mapRstDoc.erase(it);
    }
    mapRstDoc.insert( pair(docid,doccnt) );

    strBucket = strBucket.substr(idx+1);
   }

   // remember the last one
   docid = strBucket;
   doccnt = 0;
   map::iterator it = mapRstDoc.find(docid);
   if ( it != mapRstDoc.end() )
   {
    doccnt = (*it).second + 1;
    mapRstDoc.erase(it);
   }
   mapRstDoc.insert( pair(docid,doccnt) );
  }

  // sort by term frequencty
  multimap > newRstDoc;
  map::iterator it0 = mapRstDoc.begin();
  for ( ; it0 != mapRstDoc.end(); ++it0 ){
   newRstDoc.insert( pair((*it0).second,(*it0).first) );
  }

  multimap::iterator itNewRstDoc = newRstDoc.begin();
  setRelevantRst.clear();
  for ( ; itNewRstDoc != newRstDoc.end(); ++itNewRstDoc ){
   string docid = (*itNewRstDoc).second;

   if (bFirst==true) {
    setRelevantRst.insert(docid);
    continue;
   }

   if ( setSRst.find(docid) != setSRst.end() ){ 
    setRelevantRst.insert(docid);
   }
  }

  //cout << "setRelevantRst.size(): " << setRelevantRst.size() << "";
  bFirst = false;
 }
 return true;
}

鎺ヤ笅鏉ョ殑灝辨槸鐜板疄浜嗭紝鍓嶉潰閮藉彧鏄鐞嗘暟鎹緱鍒?setRelevantRst 榪欎釜鏌ヨ緇撴瀯闆嗗悎,榪欓噷灝變笉澶氳浜嗕笅闈㈠氨鍜宲hp涔嬬被鐨勮剼鏈璦宸笉澶氾紝鏍煎紡鍖栫粨鏋滈泦鍚堝茍鏄劇ず鍑烘潵銆?br>//TSESearch.cpp

view plaincopy to clipboardprint?
// Display starts here: the result page is emitted in three parts.
    CDisplayRst iDisplayRst;   
    iDisplayRst.ShowTop();   
 
    // Elapsed search time in milliseconds, from the two gettimeofday() samples.
    float used_msec = (end_tv.tv_sec-begin_tv.tv_sec)*1000   
        +((float)(end_tv.tv_usec-begin_tv.tv_usec))/(float)1000;   
 
    // Middle part: receives the query text, the elapsed time, the hit count and the start position.
    iDisplayRst.ShowMiddle(iQuery.m_sQuery,used_msec,   
            setRelevantRst.size(), iQuery.m_iStart);  
 
    // Bottom part: receives the terms, the matching doc ids and the document index.
    iDisplayRst.ShowBelow(vecTerm,setRelevantRst,vecDocIdx,iQuery.m_iStart);

 



]]>
鑷《鍚戜笅瀛︽悳绱㈠紩鎿庘斺斿寳澶уぉ緗戞悳绱㈠紩鎿嶵SE鍒嗘瀽鍙?qiáng)瀹屽叏娉ㄩ噴[2]璺繃鏌ヨ澶勭悊紼嬪簭http://m.shnenglu.com/jrckkyy/archive/2009/12/10/102940.html瀛﹁呯珯鍦ㄥ法浜虹殑鑲╄唨涓?/dc:creator>瀛﹁呯珯鍦ㄥ法浜虹殑鑲╄唨涓?/author>Thu, 10 Dec 2009 14:52:00 GMThttp://m.shnenglu.com/jrckkyy/archive/2009/12/10/102940.htmlhttp://m.shnenglu.com/jrckkyy/comments/102940.htmlhttp://m.shnenglu.com/jrckkyy/archive/2009/12/10/102940.html#Feedback0http://m.shnenglu.com/jrckkyy/comments/commentRss/102940.htmlhttp://m.shnenglu.com/jrckkyy/services/trackbacks/102940.html鐢變笂涓綃囨枃绔燵鍘焆鑷《鍚戜笅瀛︽悳绱㈠紩鎿庘斺斿寳澶уぉ緗戞悳绱㈠紩鎿嶵SE鍒嗘瀽鍙?qiáng)瀹屽叏娉ㄩ噴[1]瀵繪壘鎼滅儲(chǔ)寮曟搸鍏ュ彛 鎴戜滑鍙互鐭ラ亾鏁翠釜紼嬪簭鏄粠TSESearch.cpp 涓殑main鍑芥暟寮濮嬬殑鎴戜滑閲嶇偣涓涓嬭繖孌典唬鐮?/p>

//TSESearch.cpp CQuery iQuery;
 iQuery.GetInputs();  //鍏蜂綋紼嬪簭寮濮嬫墽琛?br> // current query & result page number
 iQuery.SetQuery();
 iQuery.SetStart();

 // begin to search
 //寮濮嬪叿浣撴悳绱㈢▼搴?br> gettimeofday(&begin_tv,&tz); //寮濮嬭鏃惰幏鍙栫▼搴忚繍琛屾椂闂村樊

 iQuery.GetInvLists(mapBuckets);  //灝嗘墍鏈夊瓧絎﹂泦瀛樺叆鏄犲皠鍙橀噺涓?nbsp;鐡墮鎵鍦?br> iQuery.GetDocIdx(vecDocIdx);  //灝嗗掓帓绱㈠紩瀛樺叆鍚戦噺涓?nbsp; 鐡墮鎵鍦?br> 
 CHzSeg iHzSeg;  //include ChSeg/HzSeg.h
 iQuery.m_sSegQuery = iHzSeg.SegmentSentenceMM(iDict, iQuery.m_sQuery); //灝唃et鍒扮殑鏌ヨ鍙橀噺鍒嗚瘝鍒嗘垚 "鎴?  鐖?  浣犱滑/ 鐨?  鏍煎紡"
 
 vector vecTerm;
 iQuery.ParseQuery(vecTerm);  //灝嗕互"/"鍒掑垎寮鐨勫叧閿瓧涓涓欏哄簭鏀懼叆涓涓悜閲忓鍣ㄤ腑
 
 set setRelevantRst;
 iQuery.GetRelevantRst(vecTerm, mapBuckets, setRelevantRst);
 
 gettimeofday(&end_tv,&tz);
 // search end
 //鎼滅儲(chǔ)瀹屾瘯鎸夌収欏哄簭鎴戜滑棣栧厛娣卞叆榪沬Query瀵硅薄鐨勭被CQuery  

//Query.cpp

1、GetInputs

榪欎釜鏂規(guī)硶鐨勫姛鑳芥槸灝嗗墠鍙癵et榪囨潵鐨勫彉閲忚漿鎹㈠埌HtmlInputs緇撴瀯浣撴暟緇勪腑濡備笅渚嬪瓙鍜屼唬鐮侊細(xì)

//鍋囪鍓嶅彴鏌ヨ鐨勫叧閿瓧鏄?1"鐫HtmlInputs涓唴瀹硅緭鍑哄涓?nbsp; //HtmlInputs[0].Name word  //HtmlInputs[0].Value 1  //HtmlInputs[1].Name www  //HtmlInputs[1].Value 鎼滅儲(chǔ)  //HtmlInputs[2].Name cdtype  //HtmlInputs[2].Value GB

 
/*
 * Get form information throught environment varible.
 * return 0 if succeed, otherwise exit.
 */
/**
 * 紼嬪簭緲昏瘧璇存槑
 * 澶勭悊GET榪囨潵鐨勮〃鍗?br> *
 * @access  public
 * @return  string 0
 */
int CQuery::GetInputs()
{
    int i,j;
 char *mode = getenv("REQUEST_METHOD"); //榪斿洖鐜鍙橀噺鐨勫?榪欓噷鐜鍙橀噺 REQUEST_METHOD 涓?get 鏂規(guī)硶
    char *tempstr; //GET鍙橀噺瀛楃涓叉垨POST瀛楃涓插唴瀹?br> char *in_line; 
 int length;  //GET鍙橀噺涓查暱搴︽垨POST鍐呭闀垮害

 cout << "Content-type: text/html\n\n";
 //cout << "Cache-Control: no-cache\n";
 //cout << "Expires: Tue, 08 Apr 1997 17:20:00 GMT\n";
 //cout << "Expires: 0\n";
 //cout << "Pragma: no-cache\n\n";

 cout << "\n";
 cout << "\n";
 //cout << "\n";
 //cout << "\n";
 //cout << "\n";
 cout << "\n";
 cout.flush(); //閲婃斁杈撳嚭緙撳啿鍖?杈撳嚭澶撮儴head鍜屼箣鍓嶇殑html鏍囩鍐呭
 //cout <<"" << endl;

 if (mode==NULL) return 1;

 if (strcmp(mode, "POST") == 0)
 {
  length = atoi(getenv("CONTENT_LENGTH")); //濡傛灉鏄疨OST鏂規(guī)硶鐫鑾峰緱鐜鍙橀噺CONTENT_LENGTH鐨勬暣鍨嬪?br>  if (length==0 || length>=256)
   return 1;
  in_line = (char*)malloc(length + 1);
  read(STDIN_FILENO, in_line, length);
  in_line[length]='\0';
 }
 else if (strcmp(mode, "GET") == 0)
 {
  char* inputstr = getenv("QUERY_STRING"); //濡傛灉鏄疓ET鏂規(guī)硶鐫鑾峰緱鐜鍙橀噺QUERY_STRING鐨勫瓧絎︿覆鍊?br>  length = strlen(inputstr);
  if (inputstr==0 || length>=256)
   return 1;

  //鑾峰彇get鍐呭闀垮害騫舵妸get 錛熷悗闈㈢殑鍙傛暟璧嬪肩粰鍙橀噺in_line
  in_line = (char*)malloc(length + 1);
  strcpy(in_line, inputstr); //灝忓績婧㈠嚭鏀誨嚮
 }


 tempstr = (char*)malloc(length + 1); //鑾峰彇post鍐呭鎴杇et鍐呭闀垮害
 if(tempstr == NULL)
 {
  printf("\n");
  printf("\n");
  printf("Major failure #1;please notify the webmaster\n");
  printf("\n");
  fflush(stdout); //杈撳嚭緙撳啿鍖?br>  exit(2); //閿欒榪斿洖
 }

 j=0;
 for (i=0; i char
   strcpy(HtmlInputs[HtmlInputCount].Name,tempstr);
   if (i == length - 1)
   {
    strcpy(HtmlInputs[HtmlInputCount].Value,"");
    HtmlInputCount++;
   }
   j=0;
  }
  else if ((in_line[i] == '&') || (i==length-1))
  {
   if (i==length-1)
   {
    if(in_line[i] == '+')tempstr[j]=' ';
    else tempstr[j] = in_line[i];
    j++;
   }
   tempstr[j]='\0';
   CStrFun::Translate(tempstr); //灝哢RL緙栫爜褰㈠紡鐨勫弬鏁拌漿鎹㈡垚瀛楃鍨?%** -> char
   strcpy(HtmlInputs[HtmlInputCount].Value,tempstr);
   HtmlInputCount++;
   j=0;
  }
  else if (in_line[i] == '+')
  {
   tempstr[j]=' ';
   j++;
  }
  else
  {
   tempstr[j]=in_line[i]; //緇勫悎get涓殑鍙橀噺濡倃ord www cdtype
   j++;
  }
  //cout<";
  //cout<";
  //cout.flush();
 }

 /*
 for (int kk = 0; kk < HtmlInputCount ; ++kk )
 {
  cout<<"Name="<";
  cout<<"Value="<";
 }
 //鍋囪鍓嶅彴鏌ヨ鐨勫叧閿瓧鏄?1"杈撳嚭濡備笅
 //Name=word
 //Value=1
 //Name=www
 //Value= 鎼滅儲(chǔ)
 //Name=cdtype
 //Value=GB
 */

 if(in_line) free(in_line);
 if(tempstr) free(tempstr);

 return 0;
}
 
2、SetQuery
 
//Query.cpp
void CQuery::SetQuery()
{
 string q = HtmlInputs[0].Value;
 CStrFun::Str2Lower(q,q.size()); //澶у啓鍙樺皬鍐?br> m_sQuery = q;  //鍑嗗鏌ヨ鍏抽敭瀛?br>}
3、SetStart
void CQuery::SetQuery()
{
 string q = HtmlInputs[0].Value;
 CStrFun::Str2Lower(q,q.size()); //澶у啓鍙樺皬鍐檞ord鍙橀噺閲岀殑鍊?br> m_sQuery = q;  //璁劇疆鏌ヨ鍏抽敭瀛?br>}

4、GetInvLists
 bool CQuery::GetInvLists(map<string, string> &mapBuckets) const
{
 ifstream ifsInvInfo(INF_INFO_NAME.c_str(), ios::binary); //浠ヤ簩榪涘埗褰㈠紡鎵撳紑涓涓枃浠剁殑杈撳叆嫻佺紦鍐詫紝INF_INFO_NAME鍦ㄥご鏂囦歡Comm.h涓畾涔変簡鐨勶紝 const string INF_INFO_NAME("./Data/sun.iidx");
 //鍊掓帓绱㈠紩鏂囦歡绱㈠紩瀛楀拰鏂囨。濂戒箣闂存湁涓涓埗琛ㄧ"\t"
 //鏈卞痙  14383 16151 16151 16151 1683 207 6302 7889 8218 8218 8637
 //鏈卞彜鍔?nbsp; 1085 1222
 
 if (!ifsInvInfo) {
  cerr << "Cannot open " << INF_INFO_NAME << " for input\n";
  return false;
 }
 string strLine, strWord, strDocNum;
 //浠ヨ璇誨彇杈撳叆嫻佺紦鍐插埌瀛楃涓插璞trLine涓茍鍋氬鐞?br> while (getline(ifsInvInfo, strLine)) {
  string::size_type idx;
  string tmp;
  idx = strLine.find("\t");
  strWord = strLine.substr(0,idx);
  strDocNum = strLine.substr(idx+1);
  mapBuckets.insert(map<string,string>::value_type (strWord, strDocNum)); //鍊掓帓琛ㄤ簩欏逛簩緇磋〃瀛樺叆鏄犲皠涓?br> 
  /*
  map<string, string>::iterator iter;
  int kkk = 0;
  for (iter = mapBuckets.begin(); kkk != 10; ++iter)
  {
   cout<<iter->first<<"  "<<iter->second<<"<br>";
   ++kkk;
  }
  cout.flush();
  */
 }
 return true;
}
 
5、GetDocIdx
 
bool CQuery::GetDocIdx(vector &vecDocIdx) const
{
 ifstream ifs(DOC_IDX_NAME.c_str(), ios::binary); 
 //0  0  bc9ce846d7987c4534f53d423380ba70
 //1  76760 4f47a3cad91f7d35f4bb6b2a638420e5
 //2  141624 d019433008538f65329ae8e39b86026c

 if (!ifs) {
  cerr << "Cannot open " << DOC_IDX_NAME << " for input\n"; //浠ヤ簩榪涘埗褰㈠紡鎵撳紑涓涓枃浠剁殑杈撳叆嫻佺紦鍐詫紝DOC_IDX_NAME鍦ㄥご鏂囦歡Comm.h涓畾涔変簡鐨勶紝 const string INF_INFO_NAME("./Data/Doc.idx"); 
  return false;
 }

 string strLine, strDocid, strUrl;
 while (getline(ifs,strLine)){
  DocIdx di;

  sscanf( strLine.c_str(), "%d%d", &di.docid, &di.offset ); //鍙繚鐣欎簡鍓嶉潰涓ら」鏂囨。鍙峰拰鍋忕Щ閲?br>  vecDocIdx.push_back(di); //瀵煎叆緇撴瀯浣撳悜閲忎腑
 }

 return true;
}

 



]]>
鑷《鍚戜笅瀛︽悳绱㈠紩鎿庘斺斿寳澶уぉ緗戞悳绱㈠紩鎿嶵SE鍒嗘瀽鍙?qiáng)瀹屽叏娉ㄩ噴[1]瀵繪壘鎼滅儲(chǔ)寮曟搸鍏ュ彛http://m.shnenglu.com/jrckkyy/archive/2009/12/10/102939.html瀛﹁呯珯鍦ㄥ法浜虹殑鑲╄唨涓?/dc:creator>瀛﹁呯珯鍦ㄥ法浜虹殑鑲╄唨涓?/author>Thu, 10 Dec 2009 14:51:00 GMThttp://m.shnenglu.com/jrckkyy/archive/2009/12/10/102939.htmlhttp://m.shnenglu.com/jrckkyy/comments/102939.htmlhttp://m.shnenglu.com/jrckkyy/archive/2009/12/10/102939.html#Feedback0http://m.shnenglu.com/jrckkyy/comments/commentRss/102939.htmlhttp://m.shnenglu.com/jrckkyy/services/trackbacks/102939.html鐢變簬鐧懼害鍗氬http://hi.baidu.com/jrckkyy鍙戣〃鏂囩珷瀛楁暟鏈夐檺錛屼互鍚庡師鍒涙枃绔犲叏閮ㄩ兘鍏堝彂琛ㄥ埌csdn鍜宑u涓婏紝鍐嶅彂琛ㄥ埌鐧懼害鍗氬涓婏紝鐧懼害鍗氬闄や簡鏀懼師鍒涚殑鏂囩珷榪樹富瑕佹斁緗戜笂瀵繪壘鍒扮殑浼樼鏂囩珷銆?/p>

鏈潃榛戝綺劇鎴戝皢闄嗙畫鎶婃渶榪戝垎鏋愭敞閲奣SE鎼滅儲(chǔ)寮曟搸鐨勫績寰楀彂甯冨嚭鏉ワ紝鑰侀笩錛屽ぇ铏撅紝澶х墰錛岄珮鎵嬮榪囧氨鏄簡錛岃嫢鎰挎剰嫻垂鎸囩偣涓嬪皬寮熺殑鍦ㄤ笅涓嶇敋鎰熸縺錛屾湁闂鐨勬湅鍙嬬洿鎺ョ暀璦璁ㄨ銆傜敱浜庢湰浜烘按騫蟲湁闄愶紝鍒嗘瀽鍜岀炕璇戦毦鍏嶆湁閿欏ぇ瀹惰絎戜簡銆?/p>

涓婂鏈熸嫓璇諱簡James F.Kurose钁楃殑銆婅綆楁満緗戠粶-鑷《鍚戜笅鏂規(guī)硶涓巌nternet鐗硅壊(絎笁鐗堥槾褰?銆嬶紝瑙夊緱鍐欏緱紜疄涓嶉敊(甯屾湜娌$湅鐨勬湅鍙嬩竴瀹氳涔版潵鐪嬬湅)錛岃嚜宸變篃鏉ユ悶涓珮鑷《鍚戜笅鐨勫涔?fàn)鏂规硶锛屽厛浠庣敤鎴风湅寰楀埌鐨勪笢瑗垮嚭鍙戝垎鏋愮爺绌舵悳鐑?chǔ)寮曟搸錛屼笅闈㈡垜浠氨鏉ョ湅鐪嬪悇澶ф悳绱㈠紩鎿庢悳绱㈢晫闈㈢殑浠g爜錛屼綘鎵闇瑕佺壒鍒敞鎰忕殑鏄痜orm琛ㄥ崟涓殑action

闆呰檸http://www.yohoo.com/錛?/p>

<form name=s1 style="margin-bottom:0" action="<table cellpadding=0 cellspacing=0 border=0><tr><td>
<input type=text size=30 name=p title="enter search terms here">&nbsp;
<input type=submit value=Search>&nbsp;&nbsp;</td><td><font face=arial size=-2>·&nbsp;
<a href="
search</a><br>·&nbsp;
<a href="
popular</a></font></td></tr></table></form>
璋鋒瓕
http://www.g.cn錛?/p>

<form method=GET action=/search><tr><td nowrap>
<font size=-1><input type=text name=q size=41 maxlength=2048 value="jrckkyy" title="Google 鎼滅儲(chǔ)"> <input type=submit name=btnG value="Google 鎼滅儲(chǔ)"><input type=hidden name=complete value=1><input type=hidden name=hl value="zh-CN"><input type=hidden name=newwindow value=1><input type=hidden name=sa value="2"></font></td></tr></form>
鐧懼害http://www.baidu.com錛?/p>

<form name=f2 action="/s">
<tr valign="middle">
<td nowrap>
<input type=hidden name=ct value="0">
<input type=hidden name=ie value="gb2312">
<input type=hidden name=bs value="jrckkyy">
<input type=hidden name=sr>
<input type=hidden name=z value="">
<input type=hidden name=cl value=3>
<input type=hidden name=f value=8>
<input name=wd size="35" class=i value="jrckkyy" maxlength=100>
<input type=submit value=鐧懼害涓涓?gt; <input type=button value=緇撴灉涓壘 onclick="return bq(f2,1,0);">&nbsp;&nbsp;&nbsp;</td>
<td nowrap><a href="</tr>
</form>
澶╃綉
http://www.tianwang.com/錛?/p>

<form name=f action="/cgi-bin/tw" method=get>
                <td valign=center width=634 background=images/index_image_02.gif>
                    <table height=46 cellspacing=0 cellpadding=0 width=600 align=right  border=0>
                        <tbody>
                            <tr>
                                <td height=50>
                                    <table cellspacing=0 cellpadding=0 width=600 border=0>
                                        <tbody>
                                            <tr>
                                  <td width="524" height="30" valign="bottom">
                                        <div align="center">                                  <input name="word" type="text" size="40" maxlength="255" onClick="this.focus();checkWord(this,1)" onblutesr='checkWord(this,0)' value='璇瘋緭鍏ヨ祫婧愬悕縐?>
                                            <font color=#ffffff> &nbsp;
                                            <select onChange=reRange(this.selectedIndex) name=range>
                                                <script language=javascript>...
                           <!--
                           for(var i = 0; i < rescode.length; i++) ...{
                               if(i == 0) ...{
                                   document.write('<option value="0" selected>' + rescode[i][0] + '</option>');
                               } else ...{
                                   document.write('<option value="' + i + '">' + rescode[i][0] + '</option>');
                               }
                           }
                           document.f.range.selectedIndex = 0;
                           -->
                         </script>
                                            </select>
                                            </font>-<font color=#ffffff>
                                            <select name=cd>
                                                <script language=javascript>...
                           <!--
                           var ind = document.f.range.selectedIndex;
                           var len = (rescode[ind].length - 1) / 2;
                           var sel = 0;
                           for(var i = 0; i < len; i++) ...{
                               document.write('<option value="' + rescode[ind][2*i+1] + '">' + rescode[ind][2*i+2] + '</option>');
                               if(rescode[ind][2*i+1] == 0)
                                   sel = i;
                           }
                           document.f.cd.selectedIndex = sel;
                           -->
                 </script>
                                            </select>
                                            </font></div>
                                    </td>
                <td width="71" valign="bottom"><input id=submit2 type=image height=22 width=40 src="images/so2.gif" align=absMiddle name=submit></td>
              </tr>
                                            <tr>
                                                <td colspan=3 height=25 class=style16>
                                                    <div align=center></div>
                                                </td>
                                            </tr>
                                        </tbody>
                                    </table>
                                </td>
                            </tr>
                        </tbody>
                    </table>
                </td>
            </form>
嫻嬭瘯鏈嶅姟鍣═SE錛?/p>

<form method="get" action="/cgi-bin/index/TSESearch" name="tw">
        <td width="100%" height="25" align="center">                          
        <input type="text" name="word" size="55">
        <input type="submit" value=" 鎼滅儲(chǔ)" name="www">
        </td>                          
        <input type="hidden" name="cdtype" value="GB">                        
        </form>   
鐢變互涓婂嚑涓猣orm鐨勫睘鎬у彲浠ョ湅鍑哄叏閮ㄩ噰鐢ㄧ殑鏄痝et鏂規(guī)硶錛孋GI鍋氫負(fù)澶勭悊紼嬪簭錛屼篃灝辨槸C/C++錛孋GI鍏ㄧО鏄?#8220;鍏叡緗戝叧鐣岄潰”(Common Gateway Interface)錛孒TTP鏈嶅姟鍣ㄤ笌浣犵殑鎴栧叾瀹冩満鍣ㄤ笂鐨勭▼搴忚繘琛?#8220;浜よ皥”鐨勪竴縐嶅伐鍏鳳紝鍏剁▼搴忛』榪愯鍦ㄧ綉緇滄湇鍔″櫒涓娿侰GI閫愭笎琚繎鍑犲勾鏉ョ殑PHP錛孞AVA錛孉SP錛孭ERL錛孭ython錛孯uby絳夊姩鎬佽璦鎵鍙栦唬銆備絾鏄叾鍦ㄩ熷害鍜岃繍琛屾晥鐜囦笂鐨勪紭鍔挎槸鏃犳硶鍙栦唬鐨勩?/p>

浠ヤ笅鏄疶SE CGI鍏ュ彛紼嬪簭娉ㄩ噴錛屽叾浠栨悳绱㈠紩鎿庣殑鍏ュ彛涔熷簲璇ョ被浼?/p>

 

/**//**
 * 紼嬪簭緲昏瘧璇存槑
 * @Copyright (c) 2008, 鐮斿彂閮?br> * All rights reserved.
 *
 * @filesource  TSESearch.cpp
 * @author  jrckkyy <jrckkyy@163.com>
 *
 * Let's start
 *
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <sys/time.h>
#include <unistd.h>

#include <iostream>
#include <fstream>
#include <list>

#include "Comm.h"    // the 2 index files and the 1 data file
#include "Query.h"    // query processing
#include "Document.h"    // HTML document handling
#include "StrFun.h"        // string utilities
#include "ChSeg/Dict.h"    // dictionary used for word segmentation
#include "ChSeg/HzSeg.h"   
#include "DisplayRst.h"    // result page output; the page is split into top, middle and bottom parts

using namespace std;

/**//*
 * A inverted file(INF) includes a term-index file & a inverted-lists file.
 * A inverted-lists consists of many bucks(posting lists).
 * The term-index file is stored at vecTerm, and
 * the inverted-lists is sored at mapBuckets.
 */

/**//**
 * 紼嬪簭緲昏瘧璇存槑
 * 鎼滅儲(chǔ)紼嬪簭鍏ュ彛鍓嶅彴鍏抽敭瀛楁彁浜ゅ埌璇gi紼嬪簭 渚嬪錛?/cgi-bin/index/TSESearch?word=123&start=1
 * 鍊掓帓鏂囦歡鍖呮嫭涓涓褰曟绱㈣瘝鏂囦歡鍜屼竴涓掓帓鍒楄〃鏂囦歡銆?br> * 鍊掓帓鍒楄〃鍖呭惈寰堝鏍囧織錛堟彁浜ゅ悕鍗曪級銆?br> * 璁板綍媯(gè)绱㈣瘝鏂囦歡浣跨敤vecTerm鏉ユ帓搴忥紝鍜屽掓帓鍒楄〃鏄敤mapBuckets鏉ユ帓搴忋?br> *
 * @access  public
 * @param   int char 鍙傛暟鐨勬眽瀛楄鏄?鐢ㄤ簬鎺ユ敹鍓嶅彴get浼犻掔殑鍙傛暟
 * @return  string 0
 */
int main(int argc, char* argv[])
...{
    struct timeval begin_tv, end_tv;
    struct timezone tz;

    CDict iDict;
    map<string, string> dictMap, mapBuckets;
    vector<DocIdx> vecDocIdx;    //Document銆俬

    CQuery iQuery;
    iQuery.GetInputs();        //鍏蜂綋紼嬪簭寮濮嬫墽琛?br>    // current query & result page number
    iQuery.SetQuery();
    iQuery.SetStart();

    // begin to search
    //寮濮嬪叿浣撴悳绱㈢▼搴?br>    gettimeofday(&begin_tv,&tz);    //寮濮嬭鏃惰幏鍙栫▼搴忚繍琛屾椂闂村樊

    iQuery.GetInvLists(mapBuckets);        //灝嗘墍鏈夊瓧絎﹂泦瀛樺叆鏄犲皠鍙橀噺涓?nbsp;   鐡墮鎵鍦?br>    iQuery.GetDocIdx(vecDocIdx);        //灝嗗掓帓绱㈠紩瀛樺叆鍚戦噺涓?nbsp;       鐡墮鎵鍦?br>   
    CHzSeg iHzSeg;        //include ChSeg/HzSeg.h
    iQuery.m_sSegQuery = iHzSeg.SegmentSentenceMM(iDict, iQuery.m_sQuery);    //灝唃et鍒扮殑鏌ヨ鍙橀噺鍒嗚瘝鍒嗘垚 "鎴?        鐖?        浣犱滑/    鐨?        鏍煎紡"
   
    vector<string> vecTerm;
    iQuery.ParseQuery(vecTerm);        //灝嗕互"/"鍒掑垎寮鐨勫叧閿瓧涓涓欏哄簭鏀懼叆涓涓悜閲忓鍣ㄤ腑
   
    set<string> setRelevantRst;
    iQuery.GetRelevantRst(vecTerm, mapBuckets, setRelevantRst);
   
    gettimeofday(&end_tv,&tz);
    // search end
    //鎼滅儲(chǔ)瀹屾瘯

    //涓嬮潰寮濮嬫樉紺?br>    CDisplayRst iDisplayRst;
    iDisplayRst.ShowTop();

    float used_msec = (end_tv.tv_sec-begin_tv.tv_sec)*1000
        +((float)(end_tv.tv_usec-begin_tv.tv_usec))/(float)1000;

    iDisplayRst.ShowMiddle(iQuery.m_sQuery,used_msec,
            setRelevantRst.size(), iQuery.m_iStart);

    iDisplayRst.ShowBelow(vecTerm,setRelevantRst,vecDocIdx,iQuery.m_iStart);

    return 0;

}

 

 



]]>
亚洲精品蜜桃久久久久久| 国产成人久久激情91| 国産精品久久久久久久| 色综合久久中文综合网| 国产激情久久久久影院老熟女免费 | 777久久精品一区二区三区无码| 国产精品99久久久久久人| 久久亚洲国产午夜精品理论片| 国产亚州精品女人久久久久久 | 国产精品免费看久久久香蕉| 亚洲伊人久久大香线蕉苏妲己| 久久精品无码免费不卡| 久久久久亚洲AV成人网人人网站| 久久精品毛片免费观看| 久久精品不卡| 久久久久亚洲AV成人片| 久久国产成人午夜AV影院| 综合网日日天干夜夜久久| 亚洲精品国产成人99久久| 久久无码中文字幕东京热| 久久国产亚洲高清观看| 久久国产三级无码一区二区| 精品人妻伦九区久久AAA片69| 久久综合九色综合精品| 日本WV一本一道久久香蕉| 久久精品国产影库免费看| 久久夜色精品国产亚洲| 99久久精品免费国产大片| 亚洲国产精品一区二区久久hs| 国产成人无码精品久久久免费 | AAA级久久久精品无码片| 久久亚洲天堂| 久久99国产精品久久| 亚洲精品乱码久久久久久蜜桃不卡 | 久久人人爽人人爽人人片av麻烦 | 久久免费精品视频| 久久夜色精品国产欧美乱| 无码精品久久一区二区三区| 99久久精品免费国产大片| 99久久婷婷国产综合亚洲| 色综合久久无码五十路人妻|