Finding the Most Frequently Repeated Value in a Massive Data Set


/************************************************************************/
/* Find the single most frequently occurring value in a massive data set.
   Idea: make one pass over the data, counting each value's frequency in a
         hash table, and partition the counts into 100 small files; each line
         of a small file holds two fields (value, occurrence count). Then find
         the most frequent value within each small file, and finally compare
         the 100 per-file winners with a divide-and-conquer (pairwise) search;
         the overall winner is the answer.
   Performance: O(N) for the counting pass + 100*O(N1) for scanning the small
         files (N1 = average file length) + O(k) for comparing the k = 100
         per-file maxima. */
/************************************************************************/
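/* Why the partition step is safe: the bucket index is value % 100, so every
   occurrence of a given value hashes to the same bucket and is written to the
   same small file. For example, 1234 % 100 == 34, so all copies of 1234 are
   counted in bucket 34 and land in file[34] ("file35.txt"); no value is ever
   split across two files, which is why comparing the 100 per-file maxima is
   enough to find the global maximum. */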

#include<iostream>
#include<fstream>
#include<malloc.h>
#include<stdlib.h>
const int ERROR=1;//non-zero status passed to exit() when malloc fails
using namespace std;

struct LinkHash//node of a chained hash table
{
 LinkHash *next;
 int m_nValue;
 int count;//number of times m_nValue has occurred
};
struct _Data//(value, count) record read back from a small file
{
 int Value;
 int Count;
};
const char *file[100]=//names of the 100 partition files
{"file1.txt","file2.txt","file3.txt","file4.txt","file5.txt","file6.txt","file7.txt","file8.txt","file9.txt","file10.txt",
"file11.txt","file12.txt","file13.txt","file14.txt","file15.txt","file16.txt","file17.txt","file18.txt","file19.txt","file20.txt",
"file21.txt","file22.txt","file23.txt","file24.txt","file25.txt","file26.txt","file27.txt","file28.txt","file29.txt","file30.txt",
"file31.txt","file32.txt","file33.txt","file34.txt","file35.txt","file36.txt","file37.txt","file38.txt","file39.txt","file40.txt",
"file41.txt","file42.txt","file43.txt","file44.txt","file45.txt","file46.txt","file47.txt","file48.txt","file49.txt","file50.txt",
"file51.txt","file52.txt","file53.txt","file54.txt","file55.txt","file56.txt","file57.txt","file58.txt","file59.txt","file60.txt",
"file61.txt","file62.txt","file63.txt","file64.txt","file65.txt","file66.txt","file67.txt","file68.txt","file69.txt","file70.txt",
"file71.txt","file72.txt","file73.txt","file74.txt","file75.txt","file76.txt","file77.txt","file78.txt","file79.txt","file80.txt",
"file81.txt","file82.txt","file83.txt","file84.txt","file85.txt","file86.txt","file87.txt","file88.txt","file89.txt","file90.txt",
"file91.txt","file92.txt","file93.txt","file94.txt","file95.txt","file96.txt","file97.txt","file98.txt","file99.txt","file100.txt"};
class CHashTable
{
private:
 LinkHash *HashTable[100];//100 dummy head nodes, one per bucket
public:
 CHashTable();
 ~CHashTable();

 void HashCollision(int data);
 void WriteToFile();
 _Data GetMaxFreq(const char *filename);

};
CHashTable::CHashTable()
{
 
 int i;
 for(i=0;i<100;i++)//give each bucket an empty dummy head node
 {
  HashTable[i]=(LinkHash*)malloc(sizeof(LinkHash));
  if(!HashTable[i])
   exit(ERROR);
  HashTable[i]->count=0;
  HashTable[i]->next=NULL;
  HashTable[i]->m_nValue=-1;
 }
}
CHashTable::~CHashTable()
{
 for(int i=0;i<100;i++)//release every node, including the dummy head
  for(LinkHash *p=HashTable[i],*q;p;p=q){q=p->next;free(p);}
}
int HashFunc(int key)//hash function: maps a value to a bucket index in [0,99]
{
 return key%100;
}

void CHashTable::HashCollision(int data)//separate chaining: count an existing value or insert a new one
{
 LinkHash *newNode;
 LinkHash *head;
 newNode=(LinkHash*)malloc(sizeof(LinkHash));
 if(!newNode)
  exit(ERROR);
 newNode->next=NULL;
 newNode->m_nValue=data;
 newNode->count=0;

 int p;
 bool isRep=false;//true if data is already in the chain
 p=HashFunc(data);
 head=HashTable[p];
 while(head->next)
 {
  head=head->next;
  if(head->m_nValue==data)
  {
   head->count++;//value already present: just bump its occurrence count
   isRep=true;
   break;
  }
  
 }
 if(isRep==false)//value not seen before: append the new node to the chain
 {
  head->next=newNode;
  head=newNode;
  head->count++;//first occurrence
 }
 else
  free(newNode);//value already counted above: release the unused node

 
}
void CHashTable::WriteToFile()//write each bucket's (value, count) pairs to its own small file
{
 int i;
 ofstream fout;
 for(i=0;i<100;i++)
 {
  LinkHash *p;
  fout.open(file[i]);
   if(HashTable[i]->next)
   {
    p=HashTable[i]->next;
    while(p)
    {
     fout<<p->m_nValue<<" "<<p->count<<endl;
     p=p->next;
    }
   }
  fout.close();
  fout.clear();
 }
}

_Data CHashTable::GetMaxFreq(const char *filename)//scan one small file: O(N1), N1 = file length
{//return the (value, count) record with the highest count in the file
 fstream fin;
 _Data InData;
 _Data MaxData;
 MaxData.Value=0;
 MaxData.Count=0;//default result for an empty file
 fin.open(filename);
 if(fin.is_open())
 {
  while(fin>>InData.Value>>InData.Count)
  {
   if(InData.Count>MaxData.Count)
    MaxData=InData;
  }
 }
 fin.close();
 return MaxData;
}
void BiSearchMax(_Data Array[],int start,int end,_Data &Max)//divide-and-conquer search for the entry with the largest count
{
 _Data Max1;
 Max1.Count=-1;
 int mid;
 if(start==end)
  Max=Array[start];
 else if(end-start+1==2)
 {
  if(Array[start].Count>Array[end].Count)
   Max=Array[start];
  else
   Max=Array[end];
 }
 else
 {
      mid=(start+end)/2;
    BiSearchMax(Array,start,mid,Max);
    BiSearchMax(Array,mid+1,end,Max1);

 }
 if(Max1.Count>Max.Count)
  Max=Max1;
}
int main()
{
 CHashTable HTable;
    fstream fin;
 ofstream fout;
 int i,data,indata;
 _Data FileData[101];
 _Data MaxFreq;
 MaxFreq.Count=0;
 
 fout.open("input.txt");
 for(i=0;i<1000000;i++)//generate 1,000,000 test values
 {
  data=1+rand()%1000;//uniform random value from 1 to 1000
  fout<<data<<" ";
 }
 fout.close();
 
 fin.open("input.txt");
 if(fin.is_open())
 {
  while(fin>>indata)//single pass over the raw data: N iterations, O(N)
     { 
       HTable.HashCollision(indata);
     }
 }
  
 HTable.WriteToFile();
    for(i=0;i<100;i++)//most frequent value in each of the 100 files: about 100*N1 steps (N1 = average file length), i.e. 100*O(N1)
 {
  FileData[i]=HTable.GetMaxFreq(file[i]);
    cout<<FileData[i].Value<<" "<<FileData[i].Count<<endl;
 }
 BiSearchMax(FileData,0,99,MaxFreq);//divide-and-conquer maximum over the 100 per-file winners: O(k) for k=100
  cout<<"出现最多的是"<<MaxFreq.Value<<" "<<MaxFreq.Count<<endl;
    
 return 0;
 

}
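
For comparison, below is a minimal modern-C++ sketch of the same counting idea for the case where the raw data (or a single partition file) fits in memory: std::unordered_map plays the role of the hand-written chained hash table, and "input.txt" is the same raw data file generated by the program above. It is an illustrative shortcut, not a replacement for the partitioned approach when the data really is too large for memory.

#include<iostream>
#include<fstream>
#include<unordered_map>
using namespace std;

int main()
{
 unordered_map<int,int> freq;//value -> occurrence count
 ifstream fin("input.txt");//same raw data file generated above
 int value;
 while(fin>>value)//single O(N) pass over the data
  ++freq[value];
 int bestValue=0,bestCount=0;
 for(const auto &kv:freq)//linear scan of the distinct values
  if(kv.second>bestCount)
  {
   bestValue=kv.first;
   bestCount=kv.second;
  }
 cout<<"most frequent value: "<<bestValue<<" "<<bestCount<<endl;
 return 0;
}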
