unixfy
just do it

C++博客 :: 首頁 :: 新隨筆 :: 聯系 :: 聚合

:: 管理

posts - 183, comments - 10, trackbacks - 0

<

2011年3月

>

日

一

二

三

四

五

六

27

28

1

2

3

5

7

10

11

12

13

14

15

16

17

18

19

20

21

22

23

24

25

26

27

28

29

30

31

1

2

3

4

5

6

7

8

9

常用鏈接

留言簿(3)

隨筆檔案

搜索

閱讀排行榜

評論排行榜

尋找熱點查詢

搜索引擎會通過日志文件把用戶每次檢索使用的所有檢索串都記錄下來，每個查詢串的長度為1-255字節。
假設目前有一千萬個記錄（這些查詢串的重復度比較高，雖然總數是1千萬，但如果除去重復后，不超過3百萬個）。一個查詢串的重復度越高，說明查詢它的用戶越多，也就是越熱門。請你統計最熱門的10個查詢串，要求使用的內存不能超過1G。

先統計所有查詢的次數。不重復的查詢最多有 300 萬個，每個最長 255 字節，255 * 300 * 10000 B = 765 MB，可以存入內存。這里使用 STL 中的 map，所得時間復雜度為 O(NlogM)，N 為所有查詢（包括重復的）的數目，M 為不重復的查詢的數目。更好的方法是用散列。

然后遍歷 map，維護一個大小為 10 的集合。在遍歷 map 時，將當前查詢的出現次數與集合中出現次數最小的查詢的出現次數比較，如果大于，將當前查詢替換到集合中。
這里的集合用的是 multimap（以出現次數為鍵），時間復雜度為 O(MlogK)，這里 K = 10。

總的時間復雜度為 O(NlogM) + O(MlogK)

也可以將這個過程合二為一。即每次在統計的過程中，查詢大小為 K 的集合。如果符合條件，則將當前查詢替換到集合中。但是還要考慮實時更新集合中的元素。
這種方法的時間復雜度為 O(N(logM + logK + K))。

由于第二種方法還得考慮實時更新。效率遠沒有第一種方案高。

實現：

1 #include <iostream>
2 #include <fstream>
3 #include <map>
4 #include <string>
5 using namespace std;
6
// Records one occurrence of `query` in the frequency table `data`.
//
// data  - map from query string to the number of times it has been seen;
//         modified in place.
// query - the query string to count.
//
// A query that is not yet present is entered with a count of zero and then
// incremented, so its first occurrence leaves it at 1.
void statistics(std::map<std::string, int>& data, const std::string& query)
{
    // insert() leaves an existing entry untouched and returns an iterator to
    // it; otherwise it creates the entry with the initial count of 0.
    std::map<std::string, int>::iterator slot =
        data.insert(std::make_pair(query, 0)).first;
    slot->second += 1;
}
11
// Selects the k most frequent queries from a frequency table.
//
// topK - output container; it is cleared first and then filled with
//        (count, query) pairs. Being a multimap keyed by count, it iterates
//        from the least to the most frequent of the kept entries.
// k    - maximum number of entries to keep. A value <= 0 yields an empty
//        result.
// data - map from query string to its occurrence count.
//
// When a candidate's count equals the smallest kept count, the entry that is
// already kept stays (the comparison is strictly greater-than).
void findTopK(std::multimap<int, std::string>& topK, int k, const std::map<std::string, int>& data)
{
    topK.clear();

    // Without this guard, k == 0 would dereference begin() of an empty
    // container below, and a negative k would wrap to a huge unsigned limit.
    if (k <= 0)
    {
        return;
    }

    typedef std::multimap<int, std::string>::size_type size_type;
    const size_type limit = static_cast<size_type>(k);

    for (std::map<std::string, int>::const_iterator cit = data.begin(); cit != data.end(); ++cit)
    {
        if (topK.size() < limit)
        {
            // Still room: keep every entry until the container holds k items.
            topK.insert(std::make_pair(cit->second, cit->first));
        }
        else if (cit->second > topK.begin()->first)
        {
            // begin() is the kept entry with the smallest count; evict it in
            // favour of the more frequent candidate.
            topK.erase(topK.begin());
            topK.insert(std::make_pair(cit->second, cit->first));
        }
    }
}
31
32 int main()
33 {
34     ifstream fin("queryfile.txt");
35     map<string, int> data;
36     multimap<int, string> top10;
37     string query;
38     while (getline(fin, query))
39     {
40         statistics(data, query);
41     }
42
43     //for (map<string, int>::const_iterator cit = data.begin(); cit != data.end(); ++cit)
44     //{
45     //    cout << cit->first << '\t' << cit->second << endl;
46     //}
47
48     //cout << endl;
49     findTopK(top10, 10, data);
50
51     for (multimap<int, string>::const_reverse_iterator cit = top10.rbegin(); cit != top10.rend(); ++cit)
52     {
53         cout << cit->second << '\t' << cit->first << endl;
54     }
55
56     return 0;
57 }

http://blog.donews.com/jiji262/2011/03/baidu_top_k_interview/
http://blog.redfox66.com/post/2010/09/23/top-k-algoriyhm-analysis.aspx
http://blog.csdn.net/jasonblog/archive/2010/08/19/5825026.aspx

posted on 2011-04-30 18:06 unixfy 閱讀(210) 評論(0) 編輯收藏引用

只有注冊用戶登錄后才能發表評論。
【推薦】100%開源！大型工業跨平臺軟件C++源碼提供，建模，組態！



網站導航: 博客園 IT新聞 BlogJava 博問 Chat2DB 管理

青青草原综合久久大伊人导航_色综合久久天天综合_日日噜噜夜夜狠狠久久丁香五月_热久久这里只有精品

常用鏈接

留言簿(3)

隨筆檔案

搜索

最新評論

閱讀排行榜

評論排行榜