锘??xml version="1.0" encoding="utf-8" standalone="yes"?> int main(int argc, char* argv[]) //./CrtInvertedIdx moon.fidx.sort > sun.iidx string strLine,strDocNum,tmp1=""; if (tmp.size()<2 || tmp.size() > 8) continue; if (tmp1.empty()) tmp1=tmp; if (tmp == tmp1) cout << tmp1 << "\t" << strDocNum << endl; cnt++; return 0; 榪欓噷杈撳叆 Tianwang.raw.*****錛孌oc.idx錛孶rl.idx.sort_uniq絳変笁涓枃浠訛紝杈撳嚭涓涓猅ianwang.raw.***.seg 鍒嗚瘝瀹屾瘯鐨勬枃浠?/p>
int main(int argc, char* argv[]) //ifstream ifs("Tianwang.raw.2559638448"); ifstream ifsUrl("Url.idx.sort_uniq"); //鎺掑簭騫舵秷閲嶅悗鐨剈rl瀛楀吀 while (getline(ifsUrl,strLine)) //鍋忕url瀛楀吀瀛樺叆涓涓悜閲忓唴瀛樹腑 memset(chksum, 0, 33); while (getline(ifsDoc,strLine)) //鍋忕瀛楀吀鏂囦歡灝嗗叾鏀懼叆涓涓悜閲忓唴瀛樹腑 memset(chksum, 0, 33); strFileName += ".seg"; // find document according to docId char *s; // skip Head //iDocument.m_sBody = s; delete[] pContent; CStrFun::ReplaceStr(strLine, " ", " "); return(0); 璇︾粏鐨勬枃浠跺姛鑳藉拰浠嬬粛閮藉湪榪欓噷鏈変簡浠嬬粛鑷《鍚戜笅瀛︽悳绱㈠紩鎿庘斺斿寳澶уぉ緗戞悳绱㈠紩鎿嶵SE鍒嗘瀽鍙?qiáng)瀹屽叏娉ㄩ噴[5]鍊掓帓绱㈠紩鐨勫緩绔嬪強(qiáng)鏂囦歡浠嬬粛 CrtForwardIdx.cpp鏂囦歡 int main(int argc, char* argv[]) //./CrtForwardIdx Tianwang.raw.***.seg > moon.fidx string strLine,strDocNum; cnt++; while ( (idx = strLine.find(SEPARATOR)) != string::npos ) //鎸囧畾鏌ユ壘鍒嗙晫絎? //if (cnt==100) break; return 0; author:http://hi.baidu.com/jrckkyy author:http://blog.csdn.net/jrckkyy
{
ifstream ifsImgInfo(argv[1]);
if (!ifsImgInfo)
{
cerr << "Cannot open " << argv[1] << " for input\n";
return -1;
}
int cnt = 0;
while (getline(ifsImgInfo, strLine))
{
string::size_type idx;
string tmp;
idx = strLine.find("\t");
tmp = strLine.substr(0,idx);
{
strDocNum = strDocNum + " " + strLine.substr(idx+1);
}
else
{
if ( strDocNum.empty() )
strDocNum = strDocNum + " " + strLine.substr(idx+1);
tmp1 = tmp;
strDocNum.clear();
strDocNum = strDocNum + " " + strLine.substr(idx+1);
}
//if (cnt==100) break;
}
cout << tmp1 << "\t" << strDocNum << endl; //鍊掓帓绱㈠紩涓瘡涓瓧鍏稿崟璇嶅悗鐨勬枃妗g紪鍙蜂互table閿負(fù)闂撮殧
}
]]>
{
string strLine, strFileName=argv[1];
CUrl iUrl;
vector<CUrl> vecCUrl;
CDocument iDocument;
vector<CDocument> vecCDocument;
unsigned int docId = 0;
ifstream ifs(strFileName.c_str()); //DocSegment Tianwang.raw.****
if (!ifs)
{
cerr << "Cannot open tianwang.img.info for input\n";
return -1;
}
if (!ifsUrl)
{
cerr << "Cannot open Url.idx.sort_uniq for input\n";
return -1;
}
ifstream ifsDoc("Doc.idx"); //瀛楀吀鏂囦歡
if (!ifsDoc)
{
cerr << "Cannot open Doc.idx for input\n";
return -1;
}
{
char chksum[33];
int docid;
sscanf( strLine.c_str(), "%s%d", chksum, &docid );
iUrl.m_sChecksum = chksum;
iUrl.m_nDocId = docid;
vecCUrl.push_back(iUrl);
}
{
int docid,pos,length;
char chksum[33];
sscanf( strLine.c_str(), "%d%d%d%s", &docid, &pos, &length,chksum );
iDocument.m_nDocId = docid;
iDocument.m_nPos = pos;
iDocument.m_nLength = length;
iDocument.m_sChecksum = chksum;
vecCDocument.push_back(iDocument);
}
ofstream fout(strFileName.c_str(), ios::in|ios::out|ios::trunc|ios::binary); //璁劇疆瀹屾垚鍒嗚瘝鍚庣殑鏁版嵁杈撳嚭鏂囦歡
for ( docId=0; docId<MAX_DOC_ID; docId++ )
{
int length = vecCDocument[docId+1].m_nPos - vecCDocument[docId].m_nPos -1;
char *pContent = new char[length+1];
memset(pContent, 0, length+1);
ifs.seekg(vecCDocument[docId].m_nPos);
ifs.read(pContent, length);
s = pContent;
int bytesRead = 0,newlines = 0;
while (newlines != 2 && bytesRead != HEADER_BUF_SIZE-1)
{
if (*s == '\n')
newlines++;
else
newlines = 0;
s++;
bytesRead++;
}
if (bytesRead == HEADER_BUF_SIZE-1) continue;
// skip header
bytesRead = 0,newlines = 0;
while (newlines != 2 && bytesRead != HEADER_BUF_SIZE-1)
{
if (*s == '\n')
newlines++;
else
newlines = 0;
s++;
bytesRead++;
}
if (bytesRead == HEADER_BUF_SIZE-1) continue;
iDocument.RemoveTags(s); //鍘婚櫎<>
iDocument.m_sBodyNoTags = s;
string strLine = iDocument.m_sBodyNoTags;
CStrFun::EmptyStr(strLine); // set " \t\r\n" to " "
// segment the document 鍏蜂綋鍒嗚瘝澶勭悊
CHzSeg iHzSeg;
strLine = iHzSeg.SegmentSentenceMM(iDict,strLine);
fout << docId << endl << strLine;
fout << endl;
}
}
榪欓噷鍙槸嫻厜鎺犲獎(jiǎng)寮忕殑榪囦竴閬嶅ぇ姒傜殑浠g爜錛屽悗闈㈡垜浼?xì)鏈変笓棰樿缁嗚瑙?parse html 鍜?segment docment 絳夋妧鏈?/p>
]]>
{
ifstream ifsImgInfo(argv[1]);
if (!ifsImgInfo)
{
cerr << "Cannot open " << argv[1] << " for input\n";
return -1;
}
int cnt = 0;
while (getline(ifsImgInfo, strLine))
{
string::size_type idx;
if (cnt%2 == 1) //濂囨暟琛屼負(fù)鏂囨。緙栧彿
{
strDocNum = strLine.substr(0,strLine.size());
continue;
}
if (strLine[0]=='\0' || strLine[0]=='#' || strLine[0]=='\n')
{
continue;
}
{
string tmp1 = strLine.substr(0,idx);
cout << tmp1 << "\t" << strDocNum << endl;
strLine = strLine.substr(idx + SEPARATOR.size());
}
}
}
]]>
author:http://blog.csdn.net/jrckkyy
涓婁竴綃囦富瑕佷粙緇嶄簡鍊掓帓绱㈠紩寤虹珛鐩稿叧鐨勬枃浠跺強(qiáng)涓棿鏂囦歡銆?br>TSE寤虹珛绱㈠紩鍦ㄨ繍琛岀▼搴忎笂鐨勫ぇ鑷存楠ゅ彲浠ョ畝鍖栧垎涓轟互涓嬪嚑姝ワ細(xì)
1銆佽繍琛屽懡浠?./DocIndex
浼?xì)鐢ㄥ堫C竴涓枃浠?tianwang.raw.520 //鐖彇鍥炴潵鐨勫師濮嬫枃浠訛紝鍖呭惈澶氫釜緗戦〉鐨勬墍鏈変俊鎭紝鎵浠ュ緢澶э紝榪欎篃鏄竴涓湁寰呰В鍐崇殑闂錛屽埌搴曞瓨鎴愬ぇ鏂囦歡錛堝鏋滆繃澶т細(xì)瓚呰繃2G鎴?G鐨勯檺鍒訛紝鑰屼笖鏂囦歡榪囧ぇ绱㈠紩鏁堢巼榪囦綆錛夎繕鏄皬鏂囦歡錛堟枃浠舵暟榪囧鐢ㄤ簬鎵撳紑鍏抽棴鏂囦歡鍙ユ焺鐨勬秷鑰楄繃澶э級榪樻湁寰呮濊冿紝榪樺氨鏄瓨鍌ㄦ柟妗堢殑瑙e喅鏈緇堣偗瀹氭槸瑕佸瓨涓哄垎甯冨紡鐨勶紝鏈緇堟繪枃浠墮噺鑲畾鏄細(xì)涓奣B鐨勶紝TSE鍙敮鎸佸皬鍨嬬殑鎼滅儲(chǔ)寮曟搸闇姹傘?nbsp;
浼?xì)漶旂敓涓涓嬩笁涓枃浠?Doc.idx, Url.idx, DocId2Url.idx //Data鏂囦歡澶逛腑鐨凞oc.idx DocId2Url.idx鍜孌oc.idx
2銆佽繍琛屽懡浠?sort Url.idx|uniq > Url.idx.sort_uniq //Data鏂囦歡澶逛腑鐨刄rl.idx.sort_uniq
浼?xì)鐢ㄥ堫C竴涓枃浠?Url.idx鏂囦歡 //md5 hash 涔嬪悗鐨剈rl瀹屾暣鍦板潃鍜宒ocument id鍊煎
浼?xì)漶旂敓涓涓枃浠?Url.idx.sort_uniq //URL娑堥噸錛宮d5 hash鎺掑簭錛屾彁楂樻绱㈡晥鐜?/p>
3銆佽繍琛屽懡浠?./DocSegment Tianwang.raw.2559638448
浼?xì)鐢ㄥ堫C竴涓枃浠?Tianwang.raw.2559638448 //Tianwang.raw.2559638448涓虹埇鍥炴潵鐨勬枃浠?錛屾瘡涓〉闈㈠寘鍚玥ttp澶達(dá)紝鍒嗚瘝涓哄悗闈㈠緩绔嬪埌鎺掔儲(chǔ)寮曞仛鍑嗗
浼?xì)漶旂敓涓涓枃浠?Tianwang.raw.2559638448.seg //鍒嗚瘝鏂囦歡錛岀敱涓琛宒ocument id鍙峰拰涓琛屾枃妗e垎璇嶇粍錛堝彧瀵規(guī)瘡涓枃妗?lt;html></html>涓?lt;head></head><body></body>絳夋枃瀛楁爣璁頒腑鐨勬枃鏈繘琛屽垎緇勶級鏋勬垚
4銆佽繍琛屽懡浠?./CrtForwardIdx Tianwang.raw.2559638448.seg > moon.fidx //寤虹珛鐙珛鐨勬鍚戠儲(chǔ)寮?/p>
5銆佽繍琛屽懡浠?br>#set | grep "LANG"
#LANG=en; export LANG;
#sort moon.fidx > moon.fidx.sort
6銆佽繍琛屽懡浠?./CrtInvertedIdx moon.fidx.sort > sun.iidx //寤虹珛鍊掓帓绱㈠紩
鎴戜滑鍏堜粠寤虹珛绱㈠紩鐨勭涓涓▼搴廌ocIndex.cpp寮濮嬪垎鏋愩?娉ㄩ噴綰﹀畾錛歍ianwang.raw.2559638448鏄姄鍥炴潵鍚堝茍鎴愮殑澶ф枃浠訛紝鍚庨潰灝卞彨澶ф枃浠訛紝閲岄潰鍖呭惈浜嗗緢澶氱瘒html鏂囨。錛岄噷闈㈢殑鏂囨。鏈夎寰嬬殑鍒嗛殧灝卞彨鍋氫竴綃囦竴綃囩殑鏂囨。)
//DocIndex.h start-------------------------------------------------------------
// Shared constants for the indexing and search programs (include-guarded header).
// NOTE(review): the header names on the #include lines below were lost when this
// text was extracted (the angle-bracket part is missing) — restore them before compiling.
#ifndef _COMM_H_040708_
#define _COMM_H_040708_
#include
#include
#include
#include
#include
#include
#include
#include
using namespace std;
// Upper bound on the number of bytes scanned while skipping a record/HTTP header.
const unsigned HEADER_BUF_SIZE = 1024;
const unsigned RstPerPage = 20; // number of search results returned per page to the front end
//iceway
//const unsigned MAX_DOC_IDX_ID = 21312; // per the original note: used in DocSegment.cpp
const unsigned MAX_DOC_IDX_ID = 22104;
//const string IMG_INFO_NAME("./Data/s1.1");
const string INF_INFO_NAME("./Data/sun.iidx"); // inverted index file
// Sample lines of the inverted index (a term followed by its docIDs):
//   <term> 14383 16151 16151 16151 1683 207 6302 7889 8218 8218 8637
//   <term> 1085 1222
// Original note: a little over 90,000 entries; the term file includes special symbols, punctuation and Chinese characters.
const string DOC_IDX_NAME("./Data/Doc.idx"); // Doc.idx (the original comment called this the inverted index file)
const string RAWPAGE_FILE_NAME("./Data/Tianwang.swu.iceway.1.0");
//iceway
const string DOC_FILE_NAME = "Tianwang.swu.iceway.1.0"; // per the original note: used in Docindex.cpp
const string Data_DOC_FILE_NAME = "./Data/Tianwang.swu.iceway.1.0"; // per the original note: used in Snapshot.cpp
//const string RM_THUMBNAIL_FILES("rm -f ~/public_html/ImgSE/timg/*");
//const string THUMBNAIL_DIR("/ImgSE/timg/");
#endif _COMM_H_040708_
//DocIndex.h end--------------------------------------------------------------//DocIndex.cpp start-----------------------------------------------------------
#include
#include
#include "Md5.h"
#include "Url.h"
#include "Document.h"
//iceway(mnsc)
#include "Comm.h"
#include
using namespace std;
/*
 * DocIndex entry point.
 *
 * Scans the raw crawl file DOC_FILE_NAME record by record and writes three
 * index files into the current directory:
 *   Url.idx        "<md5 of url>\t<docId>"
 *   Doc.idx        "<docId>\t<offset>\t<md5 of content>", followed by one
 *                  closing line "<number of docs>\t<offset after the last record>"
 *   DocId2Url.idx  "<docId>\t<url>"
 *
 * A record is recognised by a "version: 1.0" line immediately followed by a
 * "url: " line; a later "length: N" line gives the number of content bytes,
 * which start after one further line.
 *
 * @param argc unused
 * @param argv unused
 * @return 0 on success, -1 if the input file or any output file cannot be opened.
 */
int main(int argc, char* argv[])
{
    //ifstream ifs("Tianwang.raw.2559638448");
    //ifstream ifs("Tianwang.raw.3023555472");
    //iceway(mnsc)
    ifstream ifs(DOC_FILE_NAME.c_str());    // the raw crawl file
    if (!ifs)
    {
        // Report the file that was actually opened (the old text named an unrelated file).
        cerr << "Cannot open " << DOC_FILE_NAME << " for input\n";
        return -1;
    }

    // Create (or truncate) the three output files. Carrying on after a failed
    // open would only produce silently incomplete indexes, so stop instead.
    ofstream ofsUrl("Url.idx", ios::in|ios::out|ios::trunc|ios::binary);
    if (!ofsUrl)
    {
        cerr << "Cannot open Url.idx for output\n";
        return -1;
    }
    ofstream ofsDoc("Doc.idx", ios::in|ios::out|ios::trunc|ios::binary);
    if (!ofsDoc)
    {
        cerr << "Cannot open Doc.idx for output\n";
        return -1;
    }
    ofstream ofsDocId2Url("DocId2Url.idx", ios::in|ios::out|ios::trunc|ios::binary);
    if (!ofsDocId2Url)
    {
        cerr << "Cannot open DocId2Url.idx for output\n";
        return -1;
    }

    int cnt = 0;    // document ids are assigned sequentially, starting at 0
    string strLine;
    CUrl iUrl;
    CDocument iDocument;
    CMD5 iMD5;

    // nOffset always holds the stream position of the next line to be read, so
    // when a "version: 1.0" line is seen it is the position where that record starts.
    // NOTE(review): an int offset cannot address positions beyond INT_MAX bytes.
    int nOffset = static_cast<int>(ifs.tellg());

    while (getline(ifs, strLine))
    {
        // Blank lines and '#' lines between records are skipped.
        if (strLine.empty() || strLine[0]=='\0' || strLine[0]=='#' || strLine[0]=='\n')
        {
            nOffset = static_cast<int>(ifs.tellg());
            continue;
        }

        if (strLine.compare(0, 12, "version: 1.0") == 0)
        {
            if (!getline(ifs, strLine)) break;

            // Compare the whole 5-character prefix: substr(5) below relies on it being present.
            if (strLine.compare(0, 5, "url: ") != 0)
            {
                nOffset = static_cast<int>(ifs.tellg());
                continue;
            }
            iUrl.m_sUrl = strLine.substr(5);
            iMD5.GenerateMD5( reinterpret_cast<unsigned char*>(const_cast<char*>(iUrl.m_sUrl.c_str())),
                              iUrl.m_sUrl.size() );
            iUrl.m_sChecksum = iMD5.ToString();

            // Read on until the "length: " line, which carries the content size in bytes.
            bool bHaveLength = false;
            while (getline(ifs, strLine))
            {
                if (strLine.compare(0, 8, "length: ") == 0)
                {
                    bHaveLength = sscanf(strLine.substr(8).c_str(), "%d", &(iDocument.m_nLength)) == 1
                                  && iDocument.m_nLength >= 0;
                    break;
                }
            }
            if (!bHaveLength)
            {
                // Never reuse a stale length from the previous record.
                if (!ifs) break;    // truncated record at end of file: keep the last good offset
                nOffset = static_cast<int>(ifs.tellg());
                continue;           // unusable length: resynchronise on the next record
            }

            getline(ifs, strLine);  // skip the single line that follows "length: "

            iDocument.m_nDocId = cnt;
            iDocument.m_nPos = nOffset;     // position at which this record starts

            // Zero-filled buffer owned by std::string, so it is released on every path.
            string strContent(static_cast<string::size_type>(iDocument.m_nLength), '\0');
            ifs.read(&strContent[0], iDocument.m_nLength);
            iMD5.GenerateMD5( reinterpret_cast<unsigned char*>(&strContent[0]), iDocument.m_nLength );
            iDocument.m_sChecksum = iMD5.ToString();

            ofsUrl << iUrl.m_sChecksum;
            ofsUrl << "\t" << iDocument.m_nDocId << '\n';

            ofsDoc << iDocument.m_nDocId;
            ofsDoc << "\t" << iDocument.m_nPos;
            //ofsDoc << "\t" << iDocument.m_nLength ;
            ofsDoc << "\t" << iDocument.m_sChecksum << '\n';

            ofsDocId2Url << iDocument.m_nDocId;
            ofsDocId2Url << "\t" << iUrl.m_sUrl << '\n';

            cnt++;  // this document is done; the next one gets the next id
        }

        nOffset = static_cast<int>(ifs.tellg());
    }

    // Closing line: the document count and the offset just past the last record.
    ofsDoc << cnt;
    ofsDoc << "\t" << nOffset << '\n';

    return(0);
}
//DocIndex.cpp end-----------------------------------------------------------author:http://hi.baidu.com/jrckkyy
author:http://blog.csdn.net/jrckkyy
TSE鐢ㄧ殑鏄皢鎶撳彇鍥炴潵鐨勭綉欏墊枃妗e叏閮ㄨ鍏ヤ竴涓ぇ鏂囨。錛岃鍚庡榪欎竴涓ぇ鏂囨。鍐呯殑鏁版嵁鏁翠綋緇熶竴鐨勫緩绱㈠紩錛屽叾涓寘鍚簡鍑犱釜姝ラ銆?/p>
view plaincopy to clipboardprint?
1. The document index (Doc.idx) keeps information about each document.
It is a fixed-width ISAM (Indexed Sequential Access Method) index, ordered by docID.
The information stored in each entry includes a pointer into the repository,
a document length, a document checksum.
//Doc.idx columns: docID, byte offset of the document in the raw file, checksum (MD5 hash)
0 0 bc9ce846d7987c4534f53d423380ba70
1 76760 4f47a3cad91f7d35f4bb6b2a638420e5
2 141624 d019433008538f65329ae8e39b86026c
3 142350 5705b8f58110f9ad61b1321c52605795
//Doc.idx end
The url index (url.idx) is used to convert URLs into docIDs.
//url.idx
5c36868a9c5117eadbda747cbdb0725f 0
3272e136dd90263ee306a835c6c70d77 1
6b8601bb3bb9ab80f868d549b5c5a5f3 2
3f9eba99fa788954b5ff7f35a5db6e1f 3
//url.idx end
It is a list of URL checksums with their corresponding docIDs and is sorted by
checksum. In order to find the docID of a particular URL, the URL's checksum
is computed and a binary search is performed on the checksums file to find its
docID.
./DocIndex
got Doc.idx, Url.idx, DocId2Url.idx //Data鏂囦歡澶逛腑鐨凞oc.idx DocId2Url.idx鍜孌oc.idx涓?nbsp;
//DocId2Url.idx
0 http://*.*.edu.cn/index.aspx
1 http://*.*.edu.cn/showcontent1.jsp?NewsID=118
2 http://*.*.edu.cn/0102.html
3 http://*.*.edu.cn/0103.html
//DocId2Url.idx end
2. sort Url.idx|uniq > Url.idx.sort_uniq //Data鏂囦歡澶逛腑鐨刄rl.idx.sort_uniq
//Url.idx.sort_uniq
//瀵筯ash鍊艱繘琛屾帓搴?nbsp;
000bfdfd8b2dedd926b58ba00d40986b 1111
000c7e34b653b5135a2361c6818e48dc 1831
0019d12f438eec910a06a606f570fde8 366
0033f7c005ec776f67f496cd8bc4ae0d 2103
3. Segment document to terms, (with finding document according to the url)
./DocSegment Tianwang.raw.2559638448 //Tianwang.raw.2559638448涓虹埇鍥炴潵鐨勬枃浠?錛屾瘡涓〉闈㈠寘鍚玥ttp澶?nbsp;
got Tianwang.raw.2559638448.seg
//Tianwang.raw.2559638448 鐖彇鐨勫師濮嬬綉欏墊枃浠跺湪鏂囨。鍐呴儴姣忎竴涓枃妗d箣闂村簲璇ユ槸閫氳繃version錛?lt;/html>鍜屽洖杞﹀仛鏍囧織浣嶅垎鍓茬殑
version: 1.0
url: http://***.105.138.175/Default2.asp?lang=gb
origin: http://***.105.138.175/
date: Fri, 23 May 2008 20:01:36 GMT
ip: 162.105.138.175
length: 38413
HTTP/1.1 200 OK
Server: Microsoft-IIS/5.0
Date: Fri, 23 May 2008 11:17:49 GMT
Connection: keep-alive
Connection: Keep-Alive
Content-Length: 38088
Content-Type: text/html; Charset=gb2312
Expires: Fri, 23 May 2008 11:17:49 GMT
Set-Cookie: ASPSESSIONIDSSTRDCAB=IMEOMBIAIPDFCKPAEDJFHOIH; path=/
Cache-control: private
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"
"
<html>
<head>
<title>Apabi鏁板瓧璧勬簮騫沖彴</title>
<meta http-equiv="Content-Type" content="text/html; charset=gb2312">
<META NAME="ROBOTS" CONTENT="INDEX,NOFOLLOW">
<META NAME="DESCRIPTION" CONTENT="鏁板瓧鍥句功棣?鏂規(guī)鏁板瓧鍥句功棣?鐢?shù)瀛愬浘涔?鐢?shù)瀛愪?ebook e涔?Apabi 鏁板瓧璧勬簮騫沖彴">
<link rel="stylesheet" type="text/css" href="css\common.css">
<style type="text/css">
<!--
.style4 {color: #666666}
-->
</style>
<script LANGUAGE="vbscript">
...
</script>
<Script Language="javascript">
...
</Script>
</head>
<body leftmargin="0" topmargin="0">
</body>
</html>
//Tianwang.raw.2559638448 end
//Tianwang.raw.2559638448.seg 灝嗘瘡涓〉闈㈠垎鎴愪竴琛屽涓?娉ㄦ剰涓棿娌℃湁鍥炶濺浣滀負(fù)鍒嗛殧)
1
...
...
...
2
...
...
...
//Tianwang.raw.2559638448.seg end
//涓嬫槸 Tiny search 闈炲繀欏誨洜绱?nbsp;
4. Create forward index (docic-->termid) //寤虹珛姝e悜绱㈠紩
./CrtForwardIdx Tianwang.raw.2559638448.seg > moon.fidx
//Tianwang.raw.2559638448.seg 灝嗘瘡涓〉闈㈠垎鎴愪竴琛屽涓?lt;BR>//鍒嗚瘝 DocID<BR>1<BR>涓夋槦/ s/ 鎵嬫満/ 璁哄潧/ ,/ 鎵嬫満/ 閾冨0/ 涓嬭澆/ ,/ 鎵嬫満/ 鍥劇墖/ 涓嬭澆/ ,/ 鎵嬫満/<BR>2<BR>...<BR>...<BR>...
1. The document index (Doc.idx) keeps information about each document.
It is a fixed-width ISAM (Indexed Sequential Access Method) index, ordered by docID.
The information stored in each entry includes a pointer into the repository,
a document length, a document checksum.
//Doc.idx columns: docID, byte offset of the document in the raw file, checksum (MD5 hash)
0 0 bc9ce846d7987c4534f53d423380ba70
1 76760 4f47a3cad91f7d35f4bb6b2a638420e5
2 141624 d019433008538f65329ae8e39b86026c
3 142350 5705b8f58110f9ad61b1321c52605795
//Doc.idx end
The url index (url.idx) is used to convert URLs into docIDs.
//url.idx
5c36868a9c5117eadbda747cbdb0725f 0
3272e136dd90263ee306a835c6c70d77 1
6b8601bb3bb9ab80f868d549b5c5a5f3 2
3f9eba99fa788954b5ff7f35a5db6e1f 3
//url.idx end
It is a list of URL checksums with their corresponding docIDs and is sorted by
checksum. In order to find the docID of a particular URL, the URL's checksum
is computed and a binary search is performed on the checksums file to find its
docID.
./DocIndex
got Doc.idx, Url.idx, DocId2Url.idx //Data鏂囦歡澶逛腑鐨凞oc.idx DocId2Url.idx鍜孌oc.idx涓?/p>
//DocId2Url.idx
0 http://*.*.edu.cn/index.aspx
1 http://*.*.edu.cn/showcontent1.jsp?NewsID=118
//DocId2Url.idx end
2. sort Url.idx|uniq > Url.idx.sort_uniq //Data鏂囦歡澶逛腑鐨刄rl.idx.sort_uniq
//Url.idx.sort_uniq
//瀵筯ash鍊艱繘琛屾帓搴?/p>
000bfdfd8b2dedd926b58ba00d40986b 1111
000c7e34b653b5135a2361c6818e48dc 1831
0019d12f438eec910a06a606f570fde8 366
0033f7c005ec776f67f496cd8bc4ae0d 2103
3. Segment document to terms, (with finding document according to the url)
./DocSegment Tianwang.raw.2559638448 //Tianwang.raw.2559638448涓虹埇鍥炴潵鐨勬枃浠?錛屾瘡涓〉闈㈠寘鍚玥ttp澶?/p>
got Tianwang.raw.2559638448.seg
//Tianwang.raw.2559638448 鐖彇鐨勫師濮嬬綉欏墊枃浠跺湪鏂囨。鍐呴儴姣忎竴涓枃妗d箣闂村簲璇ユ槸閫氳繃version錛?lt;/html>鍜屽洖杞﹀仛鏍囧織浣嶅垎鍓茬殑
version: 1.0
url: http://***.105.138.175/Default2.asp?lang=gb
origin: http://***.105.138.175/
date: Fri, 23 May 2008 20:01:36 GMT
ip: 162.105.138.175
length: 38413
HTTP/1.1 200 OK
Server: Microsoft-IIS/5.0
Date: Fri, 23 May 2008 11:17:49 GMT
Connection: keep-alive
Connection: Keep-Alive
Content-Length: 38088
Content-Type: text/html; Charset=gb2312
Expires: Fri, 23 May 2008 11:17:49 GMT
Set-Cookie: ASPSESSIONIDSSTRDCAB=IMEOMBIAIPDFCKPAEDJFHOIH; path=/
Cache-control: private
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"
1銆佸瓧鍏哥殑閫夊彇錛堜簨瀹炰笂鏍規(guī)嵁涓嶅悓鏃朵唬涓嶅悓鍦版柟浜轟滑鐨勮璦涔?fàn)鎯槸涓嶄竴鏍風(fēng)殑鎵浠ヨ瀛楀吀鐨勬渶灝忓厓鐨勫彇鍊兼槸涓嶅悓鐨勶級
2銆佸掓帓绱㈠紩鐨勫緩绔嬶紙榪欓噷灝辮娑夊強(qiáng)鍒扮埇铏殑鎶撳彇鍜岀儲(chǔ)寮曠殑寤虹珛鍚庨潰灝嗛噸鐐逛粙緇嶈繖2鐐癸紝鎼滅儲(chǔ)寮曟搸鐨勬晥鐜囧拰鏈嶅姟璐ㄩ噺瀹炴晥鎬х摱棰堝湪榪欓噷錛?/p>
3銆佺浉鍏蟲у垎鏋愶紙瀵規(guī)姄鍥炴潵鐨勬枃妗e垎璇嶅緩绱㈠紩鍜岀敤鎴峰叧閿瓧鍒嗚瘝綆楁硶涓婅瀵瑰簲錛?/p>
鍚庨潰鏂囩珷浼?xì)閲嶇偣浠嬀l嶇埇铏殑鎶撳彇鍜岀儲(chǔ)寮曠殑寤虹珛銆?/p>
//TSESearch.cpp涓細(xì)
view plaincopy to clipboardprint?
CHzSeg iHzSeg; //include ChSeg/HzSeg.h
//
iQuery.m_sSegQuery = iHzSeg.SegmentSentenceMM(iDict, iQuery.m_sQuery); //灝唃et鍒扮殑鏌ヨ鍙橀噺鍒嗚瘝鍒嗘垚 "鎴? 鐖? 浣犱滑/ 鐨? 鏍煎紡"
vector<STRING></STRING> vecTerm;
iQuery.ParseQuery(vecTerm); //灝嗕互"/"鍒掑垎寮鐨勫叧閿瓧涓涓欏哄簭鏀懼叆涓涓悜閲忓鍣ㄤ腑
set<STRING></STRING> setRelevantRst;
iQuery.GetRelevantRst(vecTerm, mapBuckets, setRelevantRst);
gettimeofday(&end_tv,&tz);
// search end
//鎼滅儲(chǔ)瀹屾瘯
CHzSeg iHzSeg; //include ChSeg/HzSeg.h
//
iQuery.m_sSegQuery = iHzSeg.SegmentSentenceMM(iDict, iQuery.m_sQuery); //灝唃et鍒扮殑鏌ヨ鍙橀噺鍒嗚瘝鍒嗘垚 "鎴? 鐖? 浣犱滑/ 鐨? 鏍煎紡"
vector vecTerm;
iQuery.ParseQuery(vecTerm); //灝嗕互"/"鍒掑垎寮鐨勫叧閿瓧涓涓欏哄簭鏀懼叆涓涓悜閲忓鍣ㄤ腑
set setRelevantRst;
iQuery.GetRelevantRst(vecTerm, mapBuckets, setRelevantRst);
gettimeofday(&end_tv,&tz);
// search end
//鎼滅儲(chǔ)瀹屾瘯view plaincopy to clipboardprint?
鐪婥HzSeg 涓殑榪欎釜鏂規(guī)硶
鐪婥HzSeg 涓殑榪欎釜鏂規(guī)硶view plaincopy to clipboardprint?
//ChSeg/HzSeg.h
//ChSeg/HzSeg.hview plaincopy to clipboardprint?
/**
* 紼嬪簭緲昏瘧璇存槑
* 榪涗竴姝ュ噣鍖栨暟鎹紝杞崲姹夊瓧
* @access public
* @param CDict, string 鍙傛暟鐨勬眽瀛楄鏄?瀛楀吀錛屾煡璇㈠瓧絎︿覆
* @return string 0
*/
// process a sentence before segmentation
//鍦ㄥ垎璇嶅墠澶勭悊鍙ュ瓙
string CHzSeg::SegmentSentenceMM (CDict &dict, string s1) const
{
string s2="";
unsigned int i,len;
while (!s1.empty())
{
unsigned char ch=(unsigned char) s1[0];
if(ch<128)
{ // deal with ASCII
i=1;
len = s1.size();
while (i<LEN len="s1.length();" i="0;" 涓枃鏍囩偣絳夐潪姹夊瓧瀛楃="" if="" else="" yhf="" s1="s1.substr(i);" by="" added="" ch="=13)" s2="" cr=""></LEN>=161)
&& (!((unsigned char)s1[i]==161 && ((unsigned char)s1[i+1]>=162 && (unsigned char)s1[i+1]<=168)))
&& (!((unsigned char)s1[i]==161 && ((unsigned char)s1[i+1]>=171 && (unsigned char)s1[i+1]<=191)))
&& (!((unsigned char)s1[i]==163 && ((unsigned char)s1[i+1]==172 || (unsigned char)s1[i+1]==161)
|| (unsigned char)s1[i+1]==168 || (unsigned char)s1[i+1]==169 || (unsigned char)s1[i+1]==186
|| (unsigned char)s1[i+1]==187 || (unsigned char)s1[i+1]==191)))
{
ii=i+2; // 鍋囧畾娌℃湁鍗婁釜姹夊瓧
}
if (i==0) ii=i+2;
// 涓嶅鐞嗕腑鏂囩┖鏍?nbsp;
if (!(ch==161 && (unsigned char)s1[1]==161))
{
if (i <= s1.size()) // yhf
// 鍏朵粬鐨勯潪姹夊瓧鍙屽瓧鑺傚瓧絎﹀彲鑳借繛緇緭鍑?nbsp;
s2 += s1.substr(0, i) + SEPARATOR;
else break; // yhf
}
if (i <= s1.size()) // yhf
s1s1=s1.substr(i);
else break; //yhf
continue;
}
}
// 浠ヤ笅澶勭悊姹夊瓧涓?nbsp;
i = 2;
len = s1.length();
while(i<LEN></LEN>=176)
// while(i<LEN></LEN>=128 && (unsigned char)s1[i]!=161)
i+=2;
s2+=SegmentHzStrMM(dict, s1.substr(0,i));
if (i <= len) // yhf
s1s1=s1.substr(i);
else break; // yhf
}
return s2;
}
/**
* 紼嬪簭緲昏瘧璇存槑
* 榪涗竴姝ュ噣鍖栨暟鎹紝杞崲姹夊瓧
* @access public
* @param CDict, string 鍙傛暟鐨勬眽瀛楄鏄?瀛楀吀錛屾煡璇㈠瓧絎︿覆
* @return string 0
*/
// process a sentence before segmentation
//鍦ㄥ垎璇嶅墠澶勭悊鍙ュ瓙
string CHzSeg::SegmentSentenceMM (CDict &dict, string s1) const
{
string s2="";
unsigned int i,len;
while (!s1.empty())
{
unsigned char ch=(unsigned char) s1[0];
if(ch<128)
{ // deal with ASCII
i=1;
len = s1.size();
while (i=161)
&& (!((unsigned char)s1[i]==161 && ((unsigned char)s1[i+1]>=162 && (unsigned char)s1[i+1]<=168)))
&& (!((unsigned char)s1[i]==161 && ((unsigned char)s1[i+1]>=171 && (unsigned char)s1[i+1]<=191)))
&& (!((unsigned char)s1[i]==163 && ((unsigned char)s1[i+1]==172 || (unsigned char)s1[i+1]==161)
|| (unsigned char)s1[i+1]==168 || (unsigned char)s1[i+1]==169 || (unsigned char)s1[i+1]==186
|| (unsigned char)s1[i+1]==187 || (unsigned char)s1[i+1]==191)))
{
i=i+2; // 鍋囧畾娌℃湁鍗婁釜姹夊瓧
}
if (i==0) i=i+2;
// 涓嶅鐞嗕腑鏂囩┖鏍?br> if (!(ch==161 && (unsigned char)s1[1]==161))
{
if (i <= s1.size()) // yhf
// 鍏朵粬鐨勯潪姹夊瓧鍙屽瓧鑺傚瓧絎﹀彲鑳借繛緇緭鍑?br> s2 += s1.substr(0, i) + SEPARATOR;
else break; // yhf
}
if (i <= s1.size()) // yhf
s1=s1.substr(i);
else break; //yhf
continue;
}
}
// 浠ヤ笅澶勭悊姹夊瓧涓?/p>
i = 2;
len = s1.length();
while(i=176)
// while(i=128 && (unsigned char)s1[i]!=161)
i+=2;
s2+=SegmentHzStrMM(dict, s1.substr(0,i));
if (i <= len) // yhf
s1=s1.substr(i);
else break; // yhf
}
return s2;
}view plaincopy to clipboardprint?
view plaincopy to clipboardprint?
//Query.cpp
//Query.cppview plaincopy to clipboardprint?
<PRE class=csharp name="code">/**
* 紼嬪簭緲昏瘧璇存槑
* 灝嗕互"/"鍒掑垎寮鐨勫叧閿瓧涓涓欏哄簭鏀懼叆涓涓悜閲忓鍣ㄤ腑
*
* @access public
* @param vector<STRING></STRING> 鍙傛暟鐨勬眽瀛楄鏄庯細(xì)鍚戦噺瀹瑰櫒
* @return void
*/
void CQuery::ParseQuery(vector<STRING></STRING> &vecTerm)
{
string::size_type idx;
while ( (idx = m_sSegQuery.find("/ ")) != string::npos ) {
vecTerm.push_back(m_sSegQuery.substr(0,idx));
m_sSegQuerym_sSegQuery = m_sSegQuery.substr(idx+3);
}
}
</PRE>
<PRE class=csharp name="code"> </PRE>
<PRE class=csharp name="code"><PRE class=csharp name="code">/**
* 紼嬪簭緲昏瘧璇存槑
* 鐩稿叧鎬у垎鏋愭煡璇紝鏋勯犵粨鏋滈泦鍚坰etRelevantRst //鐡墮鎵鍦?nbsp;
*
* @access public
* @param vector<STRING></STRING> map set<STRING></STRING> 鍙傛暟鐨勬眽瀛楄鏄庯細(xì) 鐢ㄦ埛鎻愪氦鍏抽敭瀛楃殑鍒嗚瘝緇勶紝鍊掓帓绱㈠紩鏄犲皠錛岀浉鍏蟲х粨鏋滈泦鍚?nbsp;
* @return string 0
*/
bool CQuery::GetRelevantRst
(
vector<STRING></STRING> &vecTerm,
map &mapBuckets,
set<STRING></STRING> &setRelevantRst
) const
{
set<STRING></STRING> setSRst;
bool bFirst=true;
vector<STRING></STRING>::iterator itTerm = vecTerm.begin();
for ( ; itTerm != vecTerm.end(); ++itTerm )
{
setSRst.clear();
copy(setRelevantRst.begin(), setRelevantRst.end(), inserter(setSRst,setSRst.begin()));
map mapRstDoc;
string docid;
int doccnt;
map::iterator itBuckets = mapBuckets.find(*itTerm);
if (itBuckets != mapBuckets.end())
{
string strBucket = (*itBuckets).second;
string::size_type idx;
idx = strBucket.find_first_not_of(" ");
strBucketstrBucket = strBucket.substr(idx);
while ( (idx = strBucket.find(" ")) != string::npos )
{
docid = strBucket.substr(0,idx);
doccnt = 0;
if (docid.empty()) continue;
map::iterator it = mapRstDoc.find(docid);
if ( it != mapRstDoc.end() )
{
doccnt = (*it).second + 1;
mapRstDoc.erase(it);
}
mapRstDoc.insert( pair(docid,doccnt) );
strBucketstrBucket = strBucket.substr(idx+1);
}
// remember the last one
docid = strBucket;
doccnt = 0;
map::iterator it = mapRstDoc.find(docid);
if ( it != mapRstDoc.end() )
{
doccnt = (*it).second + 1;
mapRstDoc.erase(it);
}
mapRstDoc.insert( pair(docid,doccnt) );
}
// sort by term frequencty
multimap > newRstDoc;
map::iterator it0 = mapRstDoc.begin();
for ( ; it0 != mapRstDoc.end(); ++it0 ){
newRstDoc.insert( pair((*it0).second,(*it0).first) );
}
multimap::iterator itNewRstDoc = newRstDoc.begin();
setRelevantRst.clear();
for ( ; itNewRstDoc != newRstDoc.end(); ++itNewRstDoc ){
string docid = (*itNewRstDoc).second;
if (bFirst==true) {
setRelevantRst.insert(docid);
continue;
}
if ( setSRst.find(docid) != setSRst.end() ){
setRelevantRst.insert(docid);
}
}
//cout << "setRelevantRst.size(): " << setRelevantRst.size() << "<BR>";
bFirst = false;
}
return true;
}</PRE>
</PRE>
鎺ヤ笅鏉ョ殑灝辨槸鐜板疄浜嗭紝鍓嶉潰閮藉彧鏄鐞嗘暟鎹緱鍒?setRelevantRst 榪欎釜鏌ヨ緇撴瀯闆嗗悎,榪欓噷灝變笉澶氳浜嗕笅闈㈠氨鍜宲hp涔嬬被鐨勮剼鏈璦宸笉澶氾紝鏍煎紡鍖栫粨鏋滈泦鍚堝茍鏄劇ず鍑烘潵銆?nbsp;
view plaincopy to clipboardprint?/** * 紼嬪簭緲昏瘧璇存槑 * 灝嗕互"/"鍒掑垎寮鐨勫叧閿瓧涓涓欏哄簭鏀懼叆涓涓悜閲忓鍣ㄤ腑 * * @access public * @param vector<STRING></STRING> 鍙傛暟鐨勬眽瀛楄鏄庯細(xì)鍚戦噺瀹瑰櫒 * @return void */ void CQuery::ParseQuery(vector<STRING></STRING> &vecTerm) { string::size_type idx; while ( (idx = m_sSegQuery.find("/ ")) != string::npos ) { vecTerm.push_back(m_sSegQuery.substr(0,idx)); m_sSegQuery = m_sSegQuery.substr(idx+3); } } /**
* 紼嬪簭緲昏瘧璇存槑
* 灝嗕互"/"鍒掑垎寮鐨勫叧閿瓧涓涓欏哄簭鏀懼叆涓涓悜閲忓鍣ㄤ腑
*
* @access public
* @param vector 鍙傛暟鐨勬眽瀛楄鏄庯細(xì)鍚戦噺瀹瑰櫒
* @return void
*/
// Splits the segmented query m_sSegQuery into terms and appends them to vecTerm.
// Each term ends at a '/' that is followed by a space. The '/' and every space
// after it are skipped, so the separator may be followed by one or several spaces.
// The previous fixed skip of 3 characters did not match the 2-character search
// string: it could drop a character of the next term or throw std::out_of_range
// when the query ended right after a separator.
// Text after the last separator is left in m_sSegQuery and not appended.
//
// @param vecTerm receives the terms in query order (existing entries are kept)
void CQuery::ParseQuery(vector<string> &vecTerm)
{
    string::size_type idx;
    while ( (idx = m_sSegQuery.find("/ ")) != string::npos ) {
        vecTerm.push_back(m_sSegQuery.substr(0, idx));

        // First character of the next term: past the '/' and the spaces after it.
        const string::size_type next = m_sSegQuery.find_first_not_of(' ', idx + 1);
        if (next == string::npos) {
            m_sSegQuery.clear();    // nothing but the separator was left
            break;
        }
        m_sSegQuery.erase(0, next);
    }
}
view plaincopy to clipboardprint?
view plaincopy to clipboardprint?<PRE class=csharp name="code">/** * 紼嬪簭緲昏瘧璇存槑 * 鐩稿叧鎬у垎鏋愭煡璇紝鏋勯犵粨鏋滈泦鍚坰etRelevantRst //鐡墮鎵鍦?nbsp; * * @access public * @param vector<STRING></STRING> map set<STRING></STRING> 鍙傛暟鐨勬眽瀛楄鏄庯細(xì) 鐢ㄦ埛鎻愪氦鍏抽敭瀛楃殑鍒嗚瘝緇勶紝鍊掓帓绱㈠紩鏄犲皠錛岀浉鍏蟲х粨鏋滈泦鍚?nbsp; * @return string 0 */ bool CQuery::GetRelevantRst ( vector<STRING></STRING> &vecTerm, map &mapBuckets, set<STRING></STRING> &setRelevantRst ) const { set<STRING></STRING> setSRst; bool bFirst=true; vector<STRING></STRING>::iterator itTerm = vecTerm.begin(); for ( ; itTerm != vecTerm.end(); ++itTerm ) { setSRst.clear(); copy(setRelevantRst.begin(), setRelevantRst.end(), inserter(setSRst,setSRst.begin())); map mapRstDoc; string docid; int doccnt; map::iterator itBuckets = mapBuckets.find(*itTerm); if (itBuckets != mapBuckets.end()) { string strBucket = (*itBuckets).second; string::size_type idx; idx = strBucket.find_first_not_of(" "); strBucket = strBucket.substr(idx); while ( (idx = strBucket.find(" ")) != string::npos ) { docid = strBucket.substr(0,idx); doccnt = 0; if (docid.empty()) continue; map::iterator it = mapRstDoc.find(docid); if ( it != mapRstDoc.end() ) { doccnt = (*it).second + 1; mapRstDoc.erase(it); } mapRstDoc.insert( pair(docid,doccnt) ); strBucket = strBucket.substr(idx+1); } // remember the last one docid = strBucket; doccnt = 0; map::iterator it = mapRstDoc.find(docid); if ( it != mapRstDoc.end() ) { doccnt = (*it).second + 1; mapRstDoc.erase(it); } mapRstDoc.insert( pair(docid,doccnt) ); } // sort by term frequencty multimap > newRstDoc; map::iterator it0 = mapRstDoc.begin(); for ( ; it0 != mapRstDoc.end(); ++it0 ){ newRstDoc.insert( pair((*it0).second,(*it0).first) ); } multimap::iterator itNewRstDoc = newRstDoc.begin(); setRelevantRst.clear(); for ( ; itNewRstDoc != newRstDoc.end(); ++itNewRstDoc ){ string docid = (*itNewRstDoc).second; if (bFirst==true) { setRelevantRst.insert(docid); continue; } if ( setSRst.find(docid) != setSRst.end() ){ setRelevantRst.insert(docid); } } //cout 
<< "setRelevantRst.size(): " << setRelevantRst.size() << "<BR>"; bFirst = false; } return true; }</PRE> view plaincopy to clipboardprint?/** * 紼嬪簭緲昏瘧璇存槑 * 鐩稿叧鎬у垎鏋愭煡璇紝鏋勯犵粨鏋滈泦鍚坰etRelevantRst //鐡墮鎵鍦?nbsp; * * @access public * @param vector<STRING></STRING> map set<STRING></STRING> 鍙傛暟鐨勬眽瀛楄鏄庯細(xì) 鐢ㄦ埛鎻愪氦鍏抽敭瀛楃殑鍒嗚瘝緇勶紝鍊掓帓绱㈠紩鏄犲皠錛岀浉鍏蟲х粨鏋滈泦鍚?nbsp; * @return string 0 */ bool CQuery::GetRelevantRst ( vector<STRING></STRING> &vecTerm, map &mapBuckets, set<STRING></STRING> &setRelevantRst ) const { set<STRING></STRING> setSRst; bool bFirst=true; vector<STRING></STRING>::iterator itTerm = vecTerm.begin(); for ( ; itTerm != vecTerm.end(); ++itTerm ) { setSRst.clear(); copy(setRelevantRst.begin(), setRelevantRst.end(), inserter(setSRst,setSRst.begin())); map mapRstDoc; string docid; int doccnt; map::iterator itBuckets = mapBuckets.find(*itTerm); if (itBuckets != mapBuckets.end()) { string strBucket = (*itBuckets).second; string::size_type idx; idx = strBucket.find_first_not_of(" "); strBucket = strBucket.substr(idx); while ( (idx = strBucket.find(" ")) != string::npos ) { docid = strBucket.substr(0,idx); doccnt = 0; if (docid.empty()) continue; map::iterator it = mapRstDoc.find(docid); if ( it != mapRstDoc.end() ) { doccnt = (*it).second + 1; mapRstDoc.erase(it); } mapRstDoc.insert( pair(docid,doccnt) ); strBucket = strBucket.substr(idx+1); } // remember the last one docid = strBucket; doccnt = 0; map::iterator it = mapRstDoc.find(docid); if ( it != mapRstDoc.end() ) { doccnt = (*it).second + 1; mapRstDoc.erase(it); } mapRstDoc.insert( pair(docid,doccnt) ); } // sort by term frequencty multimap > newRstDoc; map::iterator it0 = mapRstDoc.begin(); for ( ; it0 != mapRstDoc.end(); ++it0 ){ newRstDoc.insert( pair((*it0).second,(*it0).first) ); } multimap::iterator itNewRstDoc = newRstDoc.begin(); setRelevantRst.clear(); for ( ; itNewRstDoc != newRstDoc.end(); ++itNewRstDoc ){ string docid = (*itNewRstDoc).second; if (bFirst==true) { setRelevantRst.insert(docid); continue; } if ( 
setSRst.find(docid) != setSRst.end() ){ setRelevantRst.insert(docid); } } //cout << "setRelevantRst.size(): " << setRelevantRst.size() << "<BR>"; bFirst = false; } return true; } /**
* 紼嬪簭緲昏瘧璇存槑
* 鐩稿叧鎬у垎鏋愭煡璇紝鏋勯犵粨鏋滈泦鍚坰etRelevantRst //鐡墮鎵鍦?br> *
* @access public
* @param vector map set 鍙傛暟鐨勬眽瀛楄鏄庯細(xì) 鐢ㄦ埛鎻愪氦鍏抽敭瀛楃殑鍒嗚瘝緇勶紝鍊掓帓绱㈠紩鏄犲皠錛岀浉鍏蟲х粨鏋滈泦鍚?br> * @return string 0
*/
bool CQuery::GetRelevantRst
(
vector &vecTerm,
map &mapBuckets,
set &setRelevantRst
) const
{
set setSRst;
bool bFirst=true;
vector::iterator itTerm = vecTerm.begin();
for ( ; itTerm != vecTerm.end(); ++itTerm )
{
setSRst.clear();
copy(setRelevantRst.begin(), setRelevantRst.end(), inserter(setSRst,setSRst.begin()));
map mapRstDoc;
string docid;
int doccnt;
map::iterator itBuckets = mapBuckets.find(*itTerm);
if (itBuckets != mapBuckets.end())
{
string strBucket = (*itBuckets).second;
string::size_type idx;
idx = strBucket.find_first_not_of(" ");
strBucket = strBucket.substr(idx);
while ( (idx = strBucket.find(" ")) != string::npos )
{
docid = strBucket.substr(0,idx);
doccnt = 0;
if (docid.empty()) continue;
map::iterator it = mapRstDoc.find(docid);
if ( it != mapRstDoc.end() )
{
doccnt = (*it).second + 1;
mapRstDoc.erase(it);
}
mapRstDoc.insert( pair(docid,doccnt) );
strBucket = strBucket.substr(idx+1);
}
// remember the last one
docid = strBucket;
doccnt = 0;
map::iterator it = mapRstDoc.find(docid);
if ( it != mapRstDoc.end() )
{
doccnt = (*it).second + 1;
mapRstDoc.erase(it);
}
mapRstDoc.insert( pair(docid,doccnt) );
}
// sort by term frequencty
multimap > newRstDoc;
map::iterator it0 = mapRstDoc.begin();
for ( ; it0 != mapRstDoc.end(); ++it0 ){
newRstDoc.insert( pair((*it0).second,(*it0).first) );
}
multimap::iterator itNewRstDoc = newRstDoc.begin();
setRelevantRst.clear();
for ( ; itNewRstDoc != newRstDoc.end(); ++itNewRstDoc ){
string docid = (*itNewRstDoc).second;
if (bFirst==true) {
setRelevantRst.insert(docid);
continue;
}
if ( setSRst.find(docid) != setSRst.end() ){
setRelevantRst.insert(docid);
}
}
//cout << "setRelevantRst.size(): " << setRelevantRst.size() << "";
bFirst = false;
}
return true;
}
鎺ヤ笅鏉ョ殑灝辨槸鐜板疄浜嗭紝鍓嶉潰閮藉彧鏄鐞嗘暟鎹緱鍒?setRelevantRst 榪欎釜鏌ヨ緇撴瀯闆嗗悎,榪欓噷灝變笉澶氳浜嗕笅闈㈠氨鍜宲hp涔嬬被鐨勮剼鏈璦宸笉澶氾紝鏍煎紡鍖栫粨鏋滈泦鍚堝茍鏄劇ず鍑烘潵銆?br>//TSESearch.cpp
view plaincopy to clipboardprint?
//涓嬮潰寮濮嬫樉紺?nbsp;
CDisplayRst iDisplayRst;
iDisplayRst.ShowTop();
float used_msec = (end_tv.tv_sec-begin_tv.tv_sec)*1000
+((float)(end_tv.tv_usec-begin_tv.tv_usec))/(float)1000;
iDisplayRst.ShowMiddle(iQuery.m_sQuery,used_msec,
setRelevantRst.size(), iQuery.m_iStart);
iDisplayRst.ShowBelow(vecTerm,setRelevantRst,vecDocIdx,iQuery.m_iStart);
//TSESearch.cpp CQuery iQuery;
iQuery.GetInputs(); //鍏蜂綋紼嬪簭寮濮嬫墽琛?br> // current query & result page number
iQuery.SetQuery();
iQuery.SetStart();
// begin to search
//寮濮嬪叿浣撴悳绱㈢▼搴?br> gettimeofday(&begin_tv,&tz); //寮濮嬭鏃惰幏鍙栫▼搴忚繍琛屾椂闂村樊
iQuery.GetInvLists(mapBuckets); //灝嗘墍鏈夊瓧絎﹂泦瀛樺叆鏄犲皠鍙橀噺涓?nbsp;鐡墮鎵鍦?br> iQuery.GetDocIdx(vecDocIdx); //灝嗗掓帓绱㈠紩瀛樺叆鍚戦噺涓?nbsp; 鐡墮鎵鍦?br>
CHzSeg iHzSeg; //include ChSeg/HzSeg.h
iQuery.m_sSegQuery = iHzSeg.SegmentSentenceMM(iDict, iQuery.m_sQuery); //灝唃et鍒扮殑鏌ヨ鍙橀噺鍒嗚瘝鍒嗘垚 "鎴? 鐖? 浣犱滑/ 鐨? 鏍煎紡"
vector vecTerm;
iQuery.ParseQuery(vecTerm); //灝嗕互"/"鍒掑垎寮鐨勫叧閿瓧涓涓欏哄簭鏀懼叆涓涓悜閲忓鍣ㄤ腑
set setRelevantRst;
iQuery.GetRelevantRst(vecTerm, mapBuckets, setRelevantRst);
gettimeofday(&end_tv,&tz);
// search end
//鎼滅儲(chǔ)瀹屾瘯鎸夌収欏哄簭鎴戜滑棣栧厛娣卞叆榪沬Query瀵硅薄鐨勭被CQuery
//Query.cpp
1銆丟etInputs
榪欎釜鏂規(guī)硶鐨勫姛鑳芥槸灝嗗墠鍙癵et榪囨潵鐨勫彉閲忚漿鎹㈠埌HtmlInputs緇撴瀯浣撴暟緇勪腑濡備笅渚嬪瓙鍜屼唬鐮侊細(xì)
//鍋囪鍓嶅彴鏌ヨ鐨勫叧閿瓧鏄?1"鐫HtmlInputs涓唴瀹硅緭鍑哄涓?nbsp; //HtmlInputs[0].Name word //HtmlInputs[0].Value 1 //HtmlInputs[1].Name www //HtmlInputs[1].Value 鎼滅儲(chǔ) //HtmlInputs[2].Name cdtype //HtmlInputs[2].Value GB
/*
* Get form information throught environment varible.
* return 0 if succeed, otherwise exit.
*/
/**
* 紼嬪簭緲昏瘧璇存槑
* 澶勭悊GET榪囨潵鐨勮〃鍗?br> *
* @access public
* @return string 0
*/
int CQuery::GetInputs()
{
int i,j;
char *mode = getenv("REQUEST_METHOD"); //榪斿洖鐜鍙橀噺鐨勫?榪欓噷鐜鍙橀噺 REQUEST_METHOD 涓?get 鏂規(guī)硶
char *tempstr; //GET鍙橀噺瀛楃涓叉垨POST瀛楃涓插唴瀹?br> char *in_line;
int length; //GET鍙橀噺涓查暱搴︽垨POST鍐呭闀垮害
cout << "Content-type: text/html\n\n";
//cout << "Cache-Control: no-cache\n";
//cout << "Expires: Tue, 08 Apr 1997 17:20:00 GMT\n";
//cout << "Expires: 0\n";
//cout << "Pragma: no-cache\n\n";
cout << "\n";
cout << "\n";
//cout << "\n";
//cout << "\n";
//cout << "\n";
cout << "\n";
cout.flush(); //閲婃斁杈撳嚭緙撳啿鍖?杈撳嚭澶撮儴head鍜屼箣鍓嶇殑html鏍囩鍐呭
//cout <<"" << endl;
if (mode==NULL) return 1;
if (strcmp(mode, "POST") == 0)
{
length = atoi(getenv("CONTENT_LENGTH")); //濡傛灉鏄疨OST鏂規(guī)硶鐫鑾峰緱鐜鍙橀噺CONTENT_LENGTH鐨勬暣鍨嬪?br> if (length==0 || length>=256)
return 1;
in_line = (char*)malloc(length + 1);
read(STDIN_FILENO, in_line, length);
in_line[length]='\0';
}
else if (strcmp(mode, "GET") == 0)
{
char* inputstr = getenv("QUERY_STRING"); //濡傛灉鏄疓ET鏂規(guī)硶鐫鑾峰緱鐜鍙橀噺QUERY_STRING鐨勫瓧絎︿覆鍊?br> length = strlen(inputstr);
if (inputstr==0 || length>=256)
return 1;
//鑾峰彇get鍐呭闀垮害騫舵妸get 錛熷悗闈㈢殑鍙傛暟璧嬪肩粰鍙橀噺in_line
in_line = (char*)malloc(length + 1);
strcpy(in_line, inputstr); //灝忓績婧㈠嚭鏀誨嚮
}
tempstr = (char*)malloc(length + 1); //鑾峰彇post鍐呭鎴杇et鍐呭闀垮害
if(tempstr == NULL)
{
printf("\n");
printf("\n");
printf("Major failure #1;please notify the webmaster\n");
printf("\n");
fflush(stdout); //杈撳嚭緙撳啿鍖?br> exit(2); //閿欒榪斿洖
}
j=0;
for (i=0; i char
strcpy(HtmlInputs[HtmlInputCount].Name,tempstr);
if (i == length - 1)
{
strcpy(HtmlInputs[HtmlInputCount].Value,"");
HtmlInputCount++;
}
j=0;
}
else if ((in_line[i] == '&') || (i==length-1))
{
if (i==length-1)
{
if(in_line[i] == '+')tempstr[j]=' ';
else tempstr[j] = in_line[i];
j++;
}
tempstr[j]='\0';
CStrFun::Translate(tempstr); //灝哢RL緙栫爜褰㈠紡鐨勫弬鏁拌漿鎹㈡垚瀛楃鍨?%** -> char
strcpy(HtmlInputs[HtmlInputCount].Value,tempstr);
HtmlInputCount++;
j=0;
}
else if (in_line[i] == '+')
{
tempstr[j]=' ';
j++;
}
else
{
tempstr[j]=in_line[i]; //緇勫悎get涓殑鍙橀噺濡倃ord www cdtype
j++;
}
//cout<";
//cout<";
//cout.flush();
}
/*
for (int kk = 0; kk < HtmlInputCount ; ++kk )
{
cout<<"Name="<";
cout<<"Value="<";
}
//鍋囪鍓嶅彴鏌ヨ鐨勫叧閿瓧鏄?1"杈撳嚭濡備笅
//Name=word
//Value=1
//Name=www
//Value= 鎼滅儲(chǔ)
//Name=cdtype
//Value=GB
*/
if(in_line) free(in_line);
if(tempstr) free(tempstr);
return 0;
}
2、SetQuery
//Query.cpp
void CQuery::SetQuery()
{
string q = HtmlInputs[0].Value;
CStrFun::Str2Lower(q,q.size()); //澶у啓鍙樺皬鍐?br> m_sQuery = q; //鍑嗗鏌ヨ鍏抽敭瀛?br>}
3、SetStart
void CQuery::SetQuery()
{
string q = HtmlInputs[0].Value;
CStrFun::Str2Lower(q,q.size()); //澶у啓鍙樺皬鍐檞ord鍙橀噺閲岀殑鍊?br> m_sQuery = q; //璁劇疆鏌ヨ鍏抽敭瀛?br>}
4、GetInvLists
/**
 * Load the inverted index file into mapBuckets.
 *
 * The file named by INF_INFO_NAME (per the original notes, defined in Comm.h
 * as "./Data/sun.iidx") holds one term per line: the term, a tab, then the
 * document numbers, e.g.
 *   <term>\t14383 16151 16151 16151 1683 207 6302 7889 8218 8218 8637
 *   <term>\t1085 1222
 * Lines without a tab carry no document list and are skipped.
 *
 * @param mapBuckets receives term -> document-number string; a term that is
 *                   already present keeps its existing value.
 * @return false if the file cannot be opened, true otherwise.
 */
bool CQuery::GetInvLists(map<string, string> &mapBuckets) const
{
    ifstream ifsInvInfo(INF_INFO_NAME.c_str(), ios::binary);
    if (!ifsInvInfo) {
        cerr << "Cannot open " << INF_INFO_NAME << " for input\n";
        return false;
    }

    string strLine;
    while (getline(ifsInvInfo, strLine)) {
        const string::size_type idx = strLine.find('\t');
        if (idx == string::npos)
            continue;   // otherwise the whole line would become both key and value

        mapBuckets.insert(map<string,string>::value_type(
            strLine.substr(0, idx), strLine.substr(idx + 1)));
    }

    return true;
}
5、GetDocIdx
bool CQuery::GetDocIdx(vector &vecDocIdx) const
{
ifstream ifs(DOC_IDX_NAME.c_str(), ios::binary);
//0 0 bc9ce846d7987c4534f53d423380ba70
//1 76760 4f47a3cad91f7d35f4bb6b2a638420e5
//2 141624 d019433008538f65329ae8e39b86026c
if (!ifs) {
cerr << "Cannot open " << DOC_IDX_NAME << " for input\n"; //浠ヤ簩榪涘埗褰㈠紡鎵撳紑涓涓枃浠剁殑杈撳叆嫻佺紦鍐詫紝DOC_IDX_NAME鍦ㄥご鏂囦歡Comm.h涓畾涔変簡鐨勶紝 const string INF_INFO_NAME("./Data/Doc.idx");
return false;
}
string strLine, strDocid, strUrl;
while (getline(ifs,strLine)){
DocIdx di;
sscanf( strLine.c_str(), "%d%d", &di.docid, &di.offset ); //鍙繚鐣欎簡鍓嶉潰涓ら」鏂囨。鍙峰拰鍋忕Щ閲?br> vecDocIdx.push_back(di); //瀵煎叆緇撴瀯浣撳悜閲忎腑
}
return true;
}
本着黑客精神我将陆续把最近分析注释TSE搜索引擎的心得发布出来，老鸟，大虾，大牛，高手飘过就是了，若愿意浪费时间指点下小弟的，在下不胜感激，有问题的朋友直接留言讨论。由于本人水平有限，分析和翻译难免有错，大家见笑了。</p>
上学期拜读了James F.Kurose著的《计算机网络-自顶向下方法与internet特色(第三版影印)》，觉得写得确实不错(希望没看的朋友一定要买来看看)，自己也来搞个自顶向下的学习方法，先从用户看得到的东西出发分析研究搜索引擎，下面我们就来看看各大搜索引擎搜索界面的代码，你所需要特别注意的是form表单中的action
雅虎http://www.yohoo.com/：</p>
<form name=s1 style="margin-bottom:0" action="<table cellpadding=0 cellspacing=0 border=0><tr><td>
<input type=text size=30 name=p title="enter search terms here">
<input type=submit value=Search> </td><td><font face=arial size=-2>·
<a href=" search</a><br>·
<a href=" popular</a></font></td></tr></table></form>
谷歌http://www.g.cn：</p>
<form method=GET action=/search><tr><td nowrap>
<font size=-1><input type=text name=q size=41 maxlength=2048 value="jrckkyy" title="Google 鎼滅儲(chǔ)"> <input type=submit name=btnG value="Google 鎼滅儲(chǔ)"><input type=hidden name=complete value=1><input type=hidden name=hl value="zh-CN"><input type=hidden name=newwindow value=1><input type=hidden name=sa value="2"></font></td></tr></form>
百度http://www.baidu.com：</p>
<form name=f2 action="/s">
<tr valign="middle">
<td nowrap>
<input type=hidden name=ct value="0">
<input type=hidden name=ie value="gb2312">
<input type=hidden name=bs value="jrckkyy">
<input type=hidden name=sr>
<input type=hidden name=z value="">
<input type=hidden name=cl value=3>
<input type=hidden name=f value=8>
<input name=wd size="35" class=i value="jrckkyy" maxlength=100>
<input type=submit value=百度一下> <input type=button value=结果中找 onclick="return bq(f2,1,0);"> </td>
<td nowrap><a href="</tr>
</form>
天网http://www.tianwang.com/：</p>
<form name=f action="/cgi-bin/tw" method=get>
<td valign=center width=634 background=images/index_image_02.gif>
<table height=46 cellspacing=0 cellpadding=0 width=600 align=right border=0>
<tbody>
<tr>
<td height=50>
<table cellspacing=0 cellpadding=0 width=600 border=0>
<tbody>
<tr>
<td width="524" height="30" valign="bottom">
<div align="center"> <input name="word" type="text" size="40" maxlength="255" onClick="this.focus();checkWord(this,1)" onblutesr='checkWord(this,0)' value='请输入资源名称'>
<font color=#ffffff>
<select onChange=reRange(this.selectedIndex) name=range>
<script language=javascript>...
<!--
for(var i = 0; i < rescode.length; i++) ...{
if(i == 0) ...{
document.write('<option value="0" selected>' + rescode[i][0] + '</option>');
} else ...{
document.write('<option value="' + i + '">' + rescode[i][0] + '</option>');
}
}
document.f.range.selectedIndex = 0;
-->
</script>
</select>
</font>-<font color=#ffffff>
<select name=cd>
<script language=javascript>...
<!--
var ind = document.f.range.selectedIndex;
var len = (rescode[ind].length - 1) / 2;
var sel = 0;
for(var i = 0; i < len; i++) ...{
document.write('<option value="' + rescode[ind][2*i+1] + '">' + rescode[ind][2*i+2] + '</option>');
if(rescode[ind][2*i+1] == 0)
sel = i;
}
document.f.cd.selectedIndex = sel;
-->
</script>
</select>
</font></div>
</td>
<td width="71" valign="bottom"><input id=submit2 type=image height=22 width=40 src="images/so2.gif" align=absMiddle name=submit></td>
</tr>
<tr>
<td colspan=3 height=25 class=style16>
<div align=center></div>
</td>
</tr>
</tbody>
</table>
</td>
</tr>
</tbody>
</table>
</td>
</form>
测试服务器TSE：</p>
<form method="get" action="/cgi-bin/index/TSESearch" name="tw">
<td width="100%" height="25" align="center">
<input type="text" name="word" size="55">
<input type="submit" value=" 鎼滅儲(chǔ)" name="www">
</td>
<input type="hidden" name="cdtype" value="GB">
</form>
由以上几个form的属性可以看出全部采用的是get方法，CGI做为处理程序，也就是C/C++。CGI全称是“公共网关界面”(Common Gateway Interface)，是HTTP服务器与你的或其它机器上的程序进行“交谈”的一种工具，其程序须运行在网络服务器上。CGI逐渐被近几年来的PHP，JAVA，ASP，PERL，Python，Ruby等动态语言所取代。但是其在速度和运行效率上的优势是无法取代的。</p>
以下是TSE CGI入口程序注释，其他搜索引擎的入口也应该类似</p>
/**
 * Program translation notes
 * @Copyright (c) 2008, R&D Department
 * All rights reserved.
 *
 * @filesource TSESearch.cpp
 * @author jrckkyy <jrckkyy@163.com>
 *
 * Let's start
 *
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <sys/time.h>
#include <unistd.h>
#include <iostream>
#include <fstream>
#include <list>
#include <map>
#include <set>
#include <string>
#include <vector>
#include "Comm.h"       // names of the two index files and the data file
#include "Query.h"      // query handling
#include "Document.h"   // HTML document handling
#include "StrFun.h"     // string helpers
#include "ChSeg/Dict.h" // word dictionary
#include "ChSeg/HzSeg.h"
#include "DisplayRst.h" // result page output: top, middle and bottom parts
using namespace std;
/*
 * An inverted file (INF) includes a term-index file & an inverted-lists file.
 * An inverted-lists file consists of many buckets (posting lists).
 * The term index is stored in vecTerm, and
 * the inverted lists are stored in mapBuckets.
 */
/**//**
* 紼嬪簭緲昏瘧璇存槑
* 鎼滅儲(chǔ)紼嬪簭鍏ュ彛鍓嶅彴鍏抽敭瀛楁彁浜ゅ埌璇gi紼嬪簭 渚嬪錛?/cgi-bin/index/TSESearch?word=123&start=1
* 鍊掓帓鏂囦歡鍖呮嫭涓涓褰曟绱㈣瘝鏂囦歡鍜屼竴涓掓帓鍒楄〃鏂囦歡銆?br> * 鍊掓帓鍒楄〃鍖呭惈寰堝鏍囧織錛堟彁浜ゅ悕鍗曪級銆?br> * 璁板綍媯(gè)绱㈣瘝鏂囦歡浣跨敤vecTerm鏉ユ帓搴忥紝鍜屽掓帓鍒楄〃鏄敤mapBuckets鏉ユ帓搴忋?br> *
* @access public
* @param int char 鍙傛暟鐨勬眽瀛楄鏄?鐢ㄤ簬鎺ユ敹鍓嶅彴get浼犻掔殑鍙傛暟
* @return string 0
*/
int main(int argc, char* argv[])
...{
struct timeval begin_tv, end_tv;
struct timezone tz;
CDict iDict;
map<string, string> dictMap, mapBuckets;
vector<DocIdx> vecDocIdx; //Document銆俬
CQuery iQuery;
iQuery.GetInputs(); //鍏蜂綋紼嬪簭寮濮嬫墽琛?br> // current query & result page number
iQuery.SetQuery();
iQuery.SetStart();
// begin to search
//寮濮嬪叿浣撴悳绱㈢▼搴?br> gettimeofday(&begin_tv,&tz); //寮濮嬭鏃惰幏鍙栫▼搴忚繍琛屾椂闂村樊
iQuery.GetInvLists(mapBuckets); //灝嗘墍鏈夊瓧絎﹂泦瀛樺叆鏄犲皠鍙橀噺涓?nbsp; 鐡墮鎵鍦?br> iQuery.GetDocIdx(vecDocIdx); //灝嗗掓帓绱㈠紩瀛樺叆鍚戦噺涓?nbsp; 鐡墮鎵鍦?br>
CHzSeg iHzSeg; //include ChSeg/HzSeg.h
iQuery.m_sSegQuery = iHzSeg.SegmentSentenceMM(iDict, iQuery.m_sQuery); //灝唃et鍒扮殑鏌ヨ鍙橀噺鍒嗚瘝鍒嗘垚 "鎴? 鐖? 浣犱滑/ 鐨? 鏍煎紡"
vector<string> vecTerm;
iQuery.ParseQuery(vecTerm); //灝嗕互"/"鍒掑垎寮鐨勫叧閿瓧涓涓欏哄簭鏀懼叆涓涓悜閲忓鍣ㄤ腑
set<string> setRelevantRst;
iQuery.GetRelevantRst(vecTerm, mapBuckets, setRelevantRst);
gettimeofday(&end_tv,&tz);
// search end
//鎼滅儲(chǔ)瀹屾瘯
//涓嬮潰寮濮嬫樉紺?br> CDisplayRst iDisplayRst;
iDisplayRst.ShowTop();
float used_msec = (end_tv.tv_sec-begin_tv.tv_sec)*1000
+((float)(end_tv.tv_usec-begin_tv.tv_usec))/(float)1000;
iDisplayRst.ShowMiddle(iQuery.m_sQuery,used_msec,
setRelevantRst.size(), iQuery.m_iStart);
iDisplayRst.ShowBelow(vecTerm,setRelevantRst,vecDocIdx,iQuery.m_iStart);
return 0;
}