封装的是附件这篇 paper 中的 Lossy Counting(LCL)频繁项计数算法,
因为对比后发现它的综合性能比较好。
xxx@ooo ~/zspal/zfeq/frequent-items/src $ cat Release/pyzlcl.py
#coding:utf-8
from pyzlcl import Lcl
#0.001 是要统计的频率下限
lcl = Lcl(0.001)
for i in xrange(200):
for j in xrange(100):
for k in xrange(j):
lcl.update(j, 1)
for i in xrange(1,100,30):
print i
print "出现的次数(估计值)",lcl.est(i)
print "estimate the worst case error in the estimate of a
particular item :" ,lcl.err(i)
print "---"*20
result = lcl.output(1000)
result.sort(key=lambda x:-x[1])
print result
print lcl.capacity()
xxx@ooo ~/zspal/zfeq/frequent-items/src $ python Release/pyzlcl.py
1
出现的次数(估计值) 200
estimate the worst case error in the estimate of a particular item : 0
------------------------------------------------------------
31
出现的次数(估计值) 6200
estimate the worst case error in the estimate of a particular item : 0
------------------------------------------------------------
61
出现的次数(估计值) 12200
estimate the worst case error in the estimate of a particular item : 0
------------------------------------------------------------
91
出现的次数(估计值) 18200
estimate the worst case error in the estimate of a particular item : 0
------------------------------------------------------------
[(99, 19800), (98, 19600), (97, 19400), (96, 19200), (95, 19000), (94,
18800), (93, 18600), (92, 18400), (91, 18200), (90, 18000), (89,
17800), (88, 17600), (87, 17400), (86, 17200), (85, 17000), (84,
16800), (83, 16600), (82, 16400), (81, 16200), (80, 16000), (79,
15800), (78, 15600), (77, 15400), (76, 15200), (75, 15000), (74,
14800), (73, 14600), (72, 14400), (71, 14200), (70, 14000), (69,
13800), (68, 13600), (67, 13400), (66, 13200), (65, 13000), (64,
12800), (63, 12600), (62, 12400), (61, 12200), (60, 12000), (59,
11800), (58, 11600), (57, 11400), (56, 11200), (55, 11000), (54,
10800), (53, 10600), (52, 10400), (51, 10200), (50, 10000), (49,
9800), (48, 9600), (47, 9400), (46, 9200), (45, 9000), (44, 8800),
(43, 8600), (42, 8400), (41, 8200), (40, 8000), (39, 7800), (38,
7600), (37, 7400), (36, 7200), (35, 7000), (34, 6800), (33, 6600),
(32, 6400), (31, 6200), (30, 6000), (29, 5800), (28, 5600), (27,
5400), (26, 5200), (25, 5000), (24, 4800), (23, 4600), (22, 4400),
(21, 4200), (20, 4000), (19, 3800), (18, 3600), (17, 3400), (16,
3200), (15, 3000), (14, 2800), (13, 2600), (12, 2400), (11, 2200),
(10, 2000), (9, 1800), (8, 1600), (7, 1400), (6, 1200), (5, 1000)]
44092
C++ 中的用法演示
xxx@ooo ~/zspal/zfeq/frequent-items/src $ cat zlcl.cc
#include "prng.h"
#include "lossycount.h"
#include <iostream>
// Counts the entries of `exact` that are >= `thresh`; defined after main().
size_t RunExact(uint32_t thresh, std::vector<uint32_t>& exact);
// Appends `number` generated values to `*data` via push_back; defined after main().
template<class T>
void generate_data(T* data,size_t number,uint32_t u32DomainSize,double dSkew);
/**
 * Demo driver: generates a data stream, feeds it to the LCL summary in 20
 * equal segments, and after every segment prints how many values reach the
 * current threshold according to the exact counts and how many entries
 * LCL_Output returns for the same threshold.
 */
int main(int argc, char **argv) {
    size_t stNumberOfPackets = 10000000;  // number of samples
    double dPhi = 0.0001;                 // report items whose frequency exceeds dPhi (one in ten thousand here)
    uint32_t u32DomainSize = 1048575;     // range of the sample values

    // Exact per-value counts, kept so the summary can be compared against them.
    std::vector<uint32_t> exact(u32DomainSize + 1, 0);

    // Generate Zipf-distributed data.
    std::vector<uint32_t> data;
    generate_data(&data, stNumberOfPackets, u32DomainSize, 1.0);

    // Split the test data into 20 segments; statistics are printed after each one.
    size_t stRuns = 20;
    size_t stRunSize = data.size() / stRuns;

    LCL_type* lcl = LCL_Init(dPhi);

    size_t stSegmentBegin = 0;
    for (size_t run = 1; run <= stRuns; ++run, stSegmentBegin += stRunSize) {
        const size_t stSegmentEnd = stSegmentBegin + stRunSize;

        // First pass over the segment: exact counting.
        for (size_t pos = stSegmentBegin; pos < stSegmentEnd; ++pos) {
            exact[data[pos]] += 1;
        }
        // Second pass over the same segment: update the LCL summary.
        for (size_t pos = stSegmentBegin; pos < stSegmentEnd; ++pos) {
            LCL_Update(lcl, data[pos], 1);
        }

        // Threshold = dPhi * (number of samples processed so far), at least 1.
        uint32_t thresh = static_cast<uint32_t>(floor(dPhi * run * stRunSize));
        if (thresh < 1) {
            thresh = 1;
        }
        std::cout << "Thresh is " << thresh << std::endl;

        size_t hh = RunExact(thresh, exact);
        std::cout << "Run: " << run << ", Exact: " << hh << std::endl;

        std::map<uint32_t, uint32_t> res = LCL_Output(lcl, thresh);
        std::cout << "LCL: " << run << ", Count: " << res.size() << std::endl;
    }

    LCL_Destroy(lcl);
    printf("\n");
    return 0;
}
/**
 * Counts how many entries of `exact` are greater than or equal to `thresh`.
 *
 * @param thresh  minimum count an entry must have to be tallied
 * @param exact   per-value counts; only read by this function
 * @return        number of entries whose count is >= thresh
 */
size_t RunExact(uint32_t thresh, std::vector<uint32_t>& exact)
{
    size_t stHeavy = 0;
    for (std::vector<uint32_t>::const_iterator it = exact.begin();
         it != exact.end(); ++it) {
        if (!(*it < thresh)) {
            stHeavy += 1;
        }
    }
    return stHeavy;
}
/**
 * Appends `number` generated sample values to `*data`.
 *
 * Each sample is drawn from a Tools::PRGZipf generator constructed with
 * (0, u32DomainSize, dSkew), passed through hash31 with two coefficients taken
 * from a prng, and finally masked with u32DomainSize. A progress line is
 * printed after every 500000 generated values.
 *
 * @param data           destination container; only push_back is called on it
 * @param number         how many values to append
 * @param u32DomainSize  upper bound handed to the Zipf generator, also used as
 *                       a bit mask on the hashed value. The mask keeps every
 *                       result <= u32DomainSize, but it can only produce all
 *                       values in [0, u32DomainSize] when u32DomainSize has the
 *                       form 2^k - 1 (e.g. 1048575).
 * @param dSkew          skew parameter forwarded to Tools::PRGZipf
 */
template<class T>
void generate_data(T* data,size_t number,uint32_t u32DomainSize,double dSkew){
    // Draw the two hash31 coefficients from a prng created with hard-coded
    // arguments, then release the generator right away.
    prng_type* prng = prng_Init(44545, 2);
    const int64_t a = static_cast<int64_t>(prng_int(prng) % MOD);
    const int64_t b = static_cast<int64_t>(prng_int(prng) % MOD);
    prng_Destroy(prng);

    Tools::Random r = Tools::Random(0xF4A54B);
    Tools::PRGZipf zipf = Tools::PRGZipf(0, u32DomainSize, dSkew, &r);

    // The index is size_t to match `number`: an int index would be compared
    // signed-vs-unsigned and would overflow for number > INT_MAX.
    for (size_t i = 0; i < number; ++i)
    {
        const size_t stCount = i + 1;
        if (stCount % 500000 == 0)
            std::cout <<"Generate Data " << stCount << std::endl;

        uint32_t v = zipf.nextLong();
        uint32_t value = hash31(a, b, v) & u32DomainSize;
        data->push_back(value);
    }
}
--
弓长
孝文
、
王
http://zsp.iteye.com/