word2vec里是用数组来实现Huffman树的,效率很高;而在学校里经常见到的是用指针加递归(或迭代)实现Huffman树,这在处理大量叶子节点的问题时并不是最佳方法。
数组法:
#include <stdio.h>
#include <stdlib.h>
#include <string.h> /* header name lost in extraction; <string.h> assumed */
// Number of elements in every code/point buffer used below.
// Reminder: a macro definition is not terminated by a semicolon.
#define MAX_CODE_LENGTH 40
/* One vocabulary entry together with its Huffman coding data. */
struct vocab_word
{
    long long cn;   /* count associated with this entry */
    int *point;     /* node indices on the path to this entry, root first */
    char *word;     /* not referenced by the code visible in this file */
    char *code;     /* 0/1 branch values of the Huffman code, root first */
    char codelen;   /* number of entries used in code */
};
/* File-scope state shared by the program; all of it starts out as zero. */
long long vocab_size;       /* vocabulary size */
long long a, b, k;          /* scratch indices and counters */
long long min1, min2;       /* indices of the smallest and second-smallest node */
long long i;                /* general-purpose loop counter */
struct vocab_word *vocab;   /* vocabulary table, allocated in main */
/*
 * qsort comparator that orders long long values from largest to smallest.
 *
 * a, b: pointers to the two long long elements being compared.
 * Returns a positive value when *b > *a, a negative value when *b < *a and
 * 0 when they are equal.
 *
 * The values are compared instead of subtracted: a long long difference
 * squeezed into an int loses its sign (or becomes 0) once the gap exceeds the
 * int range, and the subtraction itself can overflow.
 */
int VocabCompare(const void *a,const void *b)
{
    const long long lhs = *(const long long *)a;
    const long long rhs = *(const long long *)b;

    return (rhs > lhs) - (rhs < lhs);
}
/*
 * Array-based Huffman coding.
 *
 * Reads a vocabulary size followed by that many counts from input.txt, sorts
 * the counts in descending order, builds a Huffman tree over them and prints
 * every count together with its code.
 *
 * Node numbering: indices [0, vocab_size) are the leaves (sorted descending),
 * indices [vocab_size, 2*vocab_size-1) are internal nodes in creation order,
 * so the root is node 2*vocab_size-2.
 *
 * Returns 0 on success, EXIT_FAILURE when input.txt cannot be opened, the
 * input is malformed, memory runs out, or a code does not fit into
 * MAX_CODE_LENGTH entries.
 */
int main()
{
    int status = EXIT_FAILURE;
    long long *count = NULL;        /* weight of every node */
    long long *binary = NULL;       /* branch value (0/1) leading into every node */
    long long *parent_node = NULL;  /* parent index of every node */
    long long point[MAX_CODE_LENGTH];
    char code[MAX_CODE_LENGTH];
    long long root, node_total, pos1, pos2, w, d;

    if (freopen("input.txt","r",stdin) == NULL)
    {
        fprintf(stderr,"cannot open input.txt\n");
        return EXIT_FAILURE;
    }
    if (scanf("%lld",&vocab_size) != 1 || vocab_size < 1)
    {
        fprintf(stderr,"expected a positive vocabulary size\n");
        return EXIT_FAILURE;
    }
    node_total = vocab_size*2-1;
    root = node_total-1;

    vocab = calloc((size_t)vocab_size,sizeof *vocab);
    if (vocab == NULL)
        goto out_of_memory;
    for (w = 0; w < vocab_size; ++w)
    {
        vocab[w].code = calloc(MAX_CODE_LENGTH,sizeof *vocab[w].code);
        vocab[w].point = calloc(MAX_CODE_LENGTH,sizeof *vocab[w].point);
        if (vocab[w].code == NULL || vocab[w].point == NULL)
            goto out_of_memory;
    }
    count = calloc((size_t)node_total,sizeof *count);
    binary = calloc((size_t)node_total,sizeof *binary);
    parent_node = calloc((size_t)node_total,sizeof *parent_node);
    if (count == NULL || binary == NULL || parent_node == NULL)
        goto out_of_memory;

    for (w = 0; w < vocab_size; ++w)
    {
        if (scanf("%lld",&count[w]) != 1)
        {
            fprintf(stderr,"expected %lld counts, read only %lld\n",vocab_size,w);
            goto cleanup;
        }
    }
    /* Internal slots start as a huge sentinel so a slot that has not been
       created yet never wins the "smaller weight" comparison below. */
    for (w = vocab_size; w < node_total; ++w)
        count[w] = 1000000000000000LL;

    qsort(count,(size_t)vocab_size,sizeof *count,VocabCompare);
    for (w = 0; w < vocab_size; ++w)
        vocab[w].cn = count[w];

    /* pos1 walks the sorted leaves from the smallest one towards index 0,
       pos2 walks the internal nodes in creation order; the next smallest
       unmerged node is always at one of the two positions. */
    pos1 = vocab_size-1;
    pos2 = vocab_size;
    for (w = 0; w < vocab_size-1; ++w) /* vocab_size-1 merges build the tree */
    {
        long long picked[2]; /* [0] = smallest node, [1] = second smallest */
        int which;

        for (which = 0; which < 2; ++which)
        {
            if (pos1 >= 0 && count[pos1] < count[pos2])
                picked[which] = pos1--;
            else
                picked[which] = pos2++;
        }
        count[vocab_size+w] = count[picked[0]]+count[picked[1]];
        parent_node[picked[0]] = vocab_size+w;
        parent_node[picked[1]] = vocab_size+w;
        binary[picked[1]] = 1; /* smallest keeps branch 0, second smallest gets 1 */
    }

    /* Walk from every leaf up to the root, then store the path reversed so
       that code and point read from the root downwards. */
    for (w = 0; w < vocab_size; ++w)
    {
        long long node = w;
        long long len = 0;

        while (1)
        {
            /* point needs len+1 slots, so len must stay below the capacity. */
            if (len >= MAX_CODE_LENGTH-1)
            {
                fprintf(stderr,"code of entry %lld exceeds MAX_CODE_LENGTH\n",w);
                goto cleanup;
            }
            code[len] = (char)binary[node];
            point[len] = node;
            len++;
            node = parent_node[node];
            if (node == root) break;
        }
        vocab[w].codelen = (char)len; /* length of the Huffman code */
        vocab[w].point[0] = (int)root;
        for (d = 0; d < len; ++d)
        {
            vocab[w].code[len-d-1] = code[d];
            vocab[w].point[len-d] = (int)point[d];
        }
    }

    /* output */
    for (w = 0; w < vocab_size; ++w)
    {
        printf("vocab[%lld].cn=%lld\n",w,vocab[w].cn);
        printf("code: ");
        for (d = 0; d < vocab[w].codelen; ++d) printf("%d ",vocab[w].code[d]);
        printf("\n");
    }
    status = 0;
    goto cleanup;

out_of_memory:
    fprintf(stderr,"out of memory\n");
cleanup:
    free(count);
    free(binary);
    free(parent_node);
    if (vocab != NULL)
    {
        for (w = 0; w < vocab_size; ++w)
        {
            free(vocab[w].code);
            free(vocab[w].point);
        }
        free(vocab);
        vocab = NULL;
    }
    return status;
}
迭代法:
#include <stdio.h>
#include <stdlib.h>
/* Type of the weight stored in every tree node. */
typedef int ElemType;

/* Binary-tree node; a node whose two children are NULL is a leaf. */
struct BTreeNode
{
    ElemType data;                   /* weight held by the node */
    struct BTreeNode *left, *right;  /* child subtrees, NULL when absent */
};
/*
 * 1. Print a binary tree whose elements are int, in generalized-list form
 * (a modified pre-order traversal): the node value, followed by
 * "(left,right)" when the node has at least one child. The comma is written
 * only when a right child exists. A NULL tree prints nothing.
 */
void PrintBTree_int(struct BTreeNode *BT)
{
    if (BT == NULL)
        return;

    printf("%d",BT->data); /* value of the current root */
    if (BT->left == NULL && BT->right == NULL)
        return; /* leaf: no parenthesised child list */

    putchar('(');
    PrintBTree_int(BT->left);
    if (BT->right != NULL)
    {
        putchar(',');
        PrintBTree_int(BT->right);
    }
    putchar(')');
}
//2、根据数组 a 中 n 个权值建立一棵哈夫曼树,返回树根指针
struct BTreeNode * CreateHuffman(ElemType a[],int n)
{
int i,j;
struct BTreeNode **b,*q;
b = (struct BTreeNode **)malloc(n*sizeof(struct BTreeNode));
for (i = 0; i < n; ++i) //初始化b指针数组,使每个指针元素指向a数组中对应的元素结点
{
b[i] = (struct BTreeNode *)malloc(sizeof(struct BTreeNode));
b[i]->data = a[i];
b[i]->left=b[i]->right=NULL;
}
for (int i = 1; i < n; ++i)//进行 n-1 次循环建立哈夫曼树
{
//k1表示森林中具有最小权值的树根结点的下标,k2为次最小的下标
int k1=-1,k2;
for (int j = 0; j < n; ++j)//让k1初始指向森林中第一棵树,k2指向第二棵
{
if (b[j]!=NULL&&k1==-1)
{
k1=j;
continue;
}
if (b[j]!=NULL)
{
k2=j;
break;
}
}
for (int j = k2; j < n; ++j)
{
if (b[j]!=NULL)
{
if (b[j]->datadata)
{
k2=k1;
k1=j;
}
else if (b[j]->datadata)
{
k2=j;
}
}
}
//由最小权值树和次最小权值树建立一棵新树,q指向树根结点
q=(struct BTreeNode *)malloc(sizeof(struct BTreeNode));
q->data = b[k1]->data + b[k2]->data;
q->left = b[k1];
q->right =b[k2];
b[k1] = q;//将指向新树的指针赋给b指针数组中k1位置
b[k2] = NULL;//k2位置为空
}
free(b);//删除动态建立的数组b
return q;//返回整个哈夫曼树的树根指针
}
/*
 * 3. Weighted path length of a Huffman tree.
 *
 * FBT: root of the (sub)tree; NULL yields 0.
 * len: depth of FBT, 0 for the initial call.
 *
 * A leaf contributes data * len; any other node contributes the sum of its
 * two subtrees, each evaluated one level deeper.
 */
ElemType WeightPathLength(struct BTreeNode * FBT,int len)
{
    ElemType left_sum,right_sum;

    if (FBT == NULL)
        return 0;
    if (FBT->left == NULL && FBT->right == NULL)
        return FBT->data*len;

    left_sum = WeightPathLength(FBT->left,len+1);
    right_sum = WeightPathLength(FBT->right,len+1);
    return left_sum+right_sum;
}
/*
 * 4. Print the Huffman code of every leaf (derived from the weighted path
 * length traversal).
 *
 * FBT: root of the (sub)tree; NULL prints nothing.
 * len: depth of FBT, 0 for the initial call; negative values are ignored.
 *
 * The 0/1 branch values on the current path live in a static buffer that
 * grows on demand, so the depth of the tree is not limited by a fixed array
 * size. The buffer is kept for later calls and never released. If it cannot
 * be enlarged, a message goes to stderr and the subtree is skipped.
 */
void HuffmanCoding(struct BTreeNode *FBT,int len)
{
    static int *path = NULL;    /* branch values from the root to the current node */
    static size_t capacity = 0; /* number of ints path can hold */
    int i;

    if (FBT == NULL || len < 0)
        return;

    /* make sure path[0..len] exists before it is read or written */
    if ((size_t)len >= capacity)
    {
        size_t wanted = capacity > 0 ? capacity : 16;
        size_t slot;
        int *grown;

        while (wanted <= (size_t)len)
            wanted *= 2;
        grown = realloc(path,wanted*sizeof *grown);
        if (grown == NULL)
        {
            fprintf(stderr,"HuffmanCoding: out of memory\n");
            return;
        }
        for (slot = capacity; slot < wanted; ++slot)
            grown[slot] = 0;
        path = grown;
        capacity = wanted;
    }

    if (FBT->left==NULL&&FBT->right==NULL)
    {
        printf("节点权值为%d的编码",FBT->data);
        for (i = 0; i < len; ++i)
            printf("%d",path[i]);
        printf("\n");
        return;
    }

    /* Internal node: record 0 for the left branch and 1 for the right one,
       descending one level each time. */
    path[len] = 0;
    HuffmanCoding(FBT->left,len+1);
    path[len] = 1; /* path is re-read here because the recursion may have moved it */
    HuffmanCoding(FBT->right,len+1);
}
//主函数
int main()
{
freopen("input.txt","r",stdin);
int n,i;
ElemType *a;
struct BTreeNode * fbt;
//printf("从键盘输入待构造的哈夫曼树中带权叶子结点数n:");
while(1)
{
scanf("%d",&n);
printf("n:%d\n",n);
if (n>1)
break;
else
printf("重输n值:");
}
a=(ElemType *)malloc(n*sizeof(ElemType));
//printf("从键盘输入%d个整数作为权值:", n);
for (int i = 0; i < n; ++i)
{
scanf("%d",&a[i]);
printf("a[%d]=%d\n",i,a[i]);
}
fbt = CreateHuffman(a,n);
printf("广义表形式的哈夫曼树:");
PrintBTree_int(fbt);
printf("\n");
printf("哈夫曼树的带权路径长度:");
printf("%d\n",WeightPathLength(fbt,0));
printf("树中每个叶子结点的哈夫曼编码:\n");
HuffmanCoding(fbt,0);
return 0;
}