Word2Vec里实现Huffman树

word2vec里是拿数组实现Huffman树,效率很高;在学校里经常见到的是用递归、迭代实现Huffman树,这对于处理大量叶子节点的问题不是一个最佳方法。
数组法:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
// Number of entries allocated for each word's code/point buffers and for the
// scratch arrays used while walking from a leaf up to the root.
#define MAX_CODE_LENGTH 40 // note: a macro definition takes no trailing ';'
/* One vocabulary entry together with the data describing its Huffman code. */
struct vocab_word {
    long long cn;   /* count read for this word */
    int *point;     /* node indices along the path, starting with the root index */
    char *word;     /* not used by the code in this file */
    char *code;     /* Huffman code digits (0/1), root-to-leaf order */
    char codelen;   /* number of valid entries in code */
};
/* File-scope state shared by the array-based Huffman builder. */
long long vocab_size;      /* vocabulary size */
long long a, b, k, i;      /* loop counters / cursors */
long long min1, min2;      /* indices of the two smallest nodes picked in one merge step */
struct vocab_word *vocab;  /* the vocabulary table */


/*
 * qsort comparator over long long values that produces DESCENDING order.
 *
 * Returns a positive value when *b > *a, a negative value when *b < *a and
 * 0 when they are equal. The result is computed with comparisons rather than
 * a subtraction: a long long difference does not fit in the int return type
 * (and may overflow), which would give a wrong sign for widely separated
 * values.
 */
int VocabCompare(const void *a,const void *b)
{
    const long long lhs = *(const long long *)a;
    const long long rhs = *(const long long *)b;

    return (rhs > lhs) - (rhs < lhs);
}

/*
 * Array-based Huffman coding demo.
 *
 * Reads the vocabulary size followed by that many counts from "input.txt",
 * sorts the counts in descending order, builds the Huffman tree in three flat
 * arrays (count / binary / parent_node) and prints every word's count and
 * Huffman code.
 *
 * Returns 0 on success, 1 on unreadable/invalid input, allocation failure or
 * when a code would not fit in MAX_CODE_LENGTH.
 */
int main()
{
    long long *count = NULL;        /* [0,V): leaf counts, [V,2V-1): internal node counts */
    long long *binary = NULL;       /* branch digit (0/1) leading into each node */
    long long *parent_node = NULL;  /* parent index of each node */
    long long point[MAX_CODE_LENGTH];
    char code[MAX_CODE_LENGTH];
    long long pos1, pos2;
    int status = 1;

    if (freopen("input.txt","r",stdin) == NULL)
    {
        fprintf(stderr, "cannot open input.txt\n");
        return 1;
    }
    if (scanf("%lld",&vocab_size) != 1 || vocab_size < 1)
    {
        fprintf(stderr, "invalid vocabulary size\n");
        return 1;
    }

    vocab = (struct vocab_word *)calloc((size_t)vocab_size, sizeof(struct vocab_word));
    if (vocab == NULL)
    {
        fprintf(stderr, "out of memory\n");
        return 1;
    }
    for (i = 0; i < vocab_size; ++i)
    {
        vocab[i].code = (char *)calloc(MAX_CODE_LENGTH, sizeof(char));
        vocab[i].point = (int *)calloc(MAX_CODE_LENGTH, sizeof(int));
        if (vocab[i].code == NULL || vocab[i].point == NULL)
        {
            fprintf(stderr, "out of memory\n");
            goto cleanup;
        }
    }

    count = (long long *)calloc((size_t)(vocab_size*2-1), sizeof(long long));
    binary = (long long *)calloc((size_t)(vocab_size*2-1), sizeof(long long));
    parent_node = (long long *)calloc((size_t)(vocab_size*2-1), sizeof(long long));
    if (count == NULL || binary == NULL || parent_node == NULL)
    {
        fprintf(stderr, "out of memory\n");
        goto cleanup;
    }

    for (i = 0; i < vocab_size; ++i)
    {
        if (scanf("%lld",&count[i]) != 1)
        {
            fprintf(stderr, "expected %lld counts, failed at index %lld\n", vocab_size, i);
            goto cleanup;
        }
    }

    /* Internal nodes start with a huge sentinel so an unbuilt node is never picked. */
    for (i = vocab_size; i < vocab_size*2-1; ++i)
        count[i]=1e15;

    /* Sort the leaf counts in descending order: the smallest leaf is at vocab_size-1. */
    qsort(count,(size_t)vocab_size,sizeof(long long),VocabCompare);

    for (i = 0; i < vocab_size; ++i)
        vocab[i].cn=count[i];

    pos1 = vocab_size-1;  /* next unused leaf, moving towards index 0 */
    pos2 = vocab_size;    /* next unused internal node, moving upwards */

    /* vocab_size-1 merge steps build the Huffman tree. */
    for (a = 0; a < vocab_size-1; ++a)
    {
        /* Pick the smallest remaining node (min1, digit 0) ... */
        if (pos1>=0)
        {
            if (count[pos1] < count[pos2])
            {
                min1 = pos1;
                pos1--;
            }else{
                min1 = pos2;
                pos2++;
            }
        }else{
            min1 = pos2;
            pos2++;
        }

        /* ... and the second smallest (min2, digit 1). */
        if (pos1>=0)
        {
            if (count[pos1] < count[pos2])
            {
                min2 = pos1;
                pos1--;
            }else{
                min2=pos2;
                pos2++;
            }
        }else{
            min2 = pos2;
            pos2++;
        }

        count[vocab_size + a]=count[min1]+count[min2];
        parent_node[min1]=vocab_size+a;
        parent_node[min2]=vocab_size+a;
        binary[min2]=1;
    }

    /* Walk from every leaf up to the root (index 2*vocab_size-2) to collect its code. */
    for (a = 0; a < vocab_size; ++a)
    {
        b=a;
        k=0;
        while(1){
            /* point[] below needs k+1 slots, so a code may hold at most
               MAX_CODE_LENGTH-1 digits. */
            if (k >= MAX_CODE_LENGTH - 1)
            {
                fprintf(stderr, "Huffman code exceeds %d digits\n", MAX_CODE_LENGTH - 1);
                goto cleanup;
            }
            code[k] = (char)binary[b];
            point[k] = b;
            k++;
            b=parent_node[b];
            if (b==vocab_size*2-2) break;
        }
        vocab[a].codelen=(char)k;  /* Huffman code length */
        vocab[a].point[0]=(int)(vocab_size*2-2);
        for (b = 0; b < k; ++b)  /* reverse: collected leaf-to-root, stored root-to-leaf */
        {
            vocab[a].code[k-b-1]=code[b];
            vocab[a].point[k-b]=(int)point[b];
        }
    }

    /* output */
    for (a = 0; a < vocab_size; ++a)
    {
        printf("vocab[%lld].cn=%lld\n",a,vocab[a].cn);
        printf("code: ");
        for ( i = 0; i < vocab[a].codelen; ++i) printf("%d ",vocab[a].code[i]);
        printf("\n");
    }
    status = 0;

cleanup:
    free(count);
    free(binary);
    free(parent_node);
    for (i = 0; i < vocab_size; ++i)
    {
        free(vocab[i].code);
        free(vocab[i].point);
    }
    free(vocab);
    vocab = NULL;
    return status;
}

迭代法:

#include <stdio.h>
#include <stdlib.h>
/* Type of the value stored in every tree node. */
typedef int ElemType;

/* Binary tree node: a value plus links to the left and right children. */
struct BTreeNode {
    ElemType data;
    struct BTreeNode *left, *right;
};

// 1. Print a binary tree of int values in generalized-list form,
//    i.e. root(left,right), using a pre-order traversal.
//    An empty tree prints nothing; a leaf prints only its value.
void PrintBTree_int(struct BTreeNode *BT)
{
    if (BT == NULL)
        return;

    printf("%d",BT->data);  // value of the root

    // A leaf has no parenthesised child list.
    if (BT->left == NULL && BT->right == NULL)
        return;

    printf("(");
    PrintBTree_int(BT->left);   // left subtree
    if (BT->right != NULL)
    {
        printf(",");
    }
    PrintBTree_int(BT->right);  // right subtree
    printf(")");
}
//2、根据数组 a 中 n 个权值建立一棵哈夫曼树,返回树根指针 
struct BTreeNode * CreateHuffman(ElemType a[],int n)
{
    int i,j;
    struct BTreeNode **b,*q;
    b = (struct BTreeNode **)malloc(n*sizeof(struct BTreeNode));
    for (i = 0; i < n; ++i) //初始化b指针数组,使每个指针元素指向a数组中对应的元素结点 
    {
        b[i] = (struct BTreeNode *)malloc(sizeof(struct BTreeNode));
        b[i]->data = a[i];
        b[i]->left=b[i]->right=NULL;
    }
    for (int i = 1; i < n; ++i)//进行 n-1 次循环建立哈夫曼树 
    {
        //k1表示森林中具有最小权值的树根结点的下标,k2为次最小的下标  
        int k1=-1,k2;
        for (int j = 0; j < n; ++j)//让k1初始指向森林中第一棵树,k2指向第二棵  
        {
            if (b[j]!=NULL&&k1==-1)
            {
                k1=j;
                continue;
            }
            if (b[j]!=NULL)
            {
                k2=j;
                break;
            }
        }
        for (int j = k2; j < n; ++j)
        {
            if (b[j]!=NULL)
            {
                if (b[j]->datadata)
                {
                    k2=k1;
                    k1=j;
                }
                else if (b[j]->datadata)
                {
                    k2=j;
                }
            }
        }
             //由最小权值树和次最小权值树建立一棵新树,q指向树根结点 
            q=(struct BTreeNode *)malloc(sizeof(struct BTreeNode));
            q->data = b[k1]->data + b[k2]->data;
            q->left = b[k1];
            q->right =b[k2];

            b[k1] = q;//将指向新树的指针赋给b指针数组中k1位置 
            b[k2] = NULL;//k2位置为空  


    }
    free(b);//删除动态建立的数组b  
    return q;//返回整个哈夫曼树的树根指针 

}
// 3. Weighted path length of a Huffman tree: the sum over all leaves of
//    (leaf value * depth of the leaf). len is the depth of FBT itself and
//    is 0 for the initial call. An empty tree yields 0.
ElemType WeightPathLength(struct BTreeNode * FBT,int len)
{
    ElemType left_sum, right_sum;

    if (FBT == NULL)
        return 0;

    // Leaf: contributes its value times its depth.
    if (FBT->left == NULL && FBT->right == NULL)
        return FBT->data*len;

    // Internal node: add up both subtrees, one level deeper.
    left_sum = WeightPathLength(FBT->left,len+1);
    right_sum = WeightPathLength(FBT->right,len+1);
    return left_sum + right_sum;
}
// 4. Print the Huffman code of every leaf (derived from the weighted path
//    length traversal). len is the depth of FBT and is 0 for the initial call;
//    a negative len is rejected.
//
//    The branch digits of the current root-to-node path are kept in a static
//    buffer that grows on demand, so the tree depth is not limited. The buffer
//    is released when the call made with len == 0 returns. Because of the
//    static state this function is not reentrant.
void HuffmanCoding(struct BTreeNode *FBT,int len)
{
    static int *path = NULL;     // path[d] is the digit chosen at depth d
    static size_t capacity = 0;  // number of ints allocated in path

    if (FBT == NULL || len < 0)
        return;

    // Make index len addressable; new cells start at 0.
    if ((size_t)len >= capacity)
    {
        size_t wanted = capacity > 0 ? capacity : 16;
        size_t d;
        int *grown;

        while (wanted <= (size_t)len)
            wanted *= 2;
        grown = realloc(path, wanted * sizeof *grown);
        if (grown == NULL)
        {
            fprintf(stderr, "HuffmanCoding: out of memory\n");
            return;
        }
        for (d = capacity; d < wanted; ++d)
            grown[d] = 0;
        path = grown;
        capacity = wanted;
    }

    if (FBT->left==NULL&&FBT->right==NULL)
    {
        int i;
        printf("节点权值为%d的编码",FBT->data);
        for (i = 0; i < len; ++i)
            printf("%d",path[i]);
        printf("\n");
    }
    else
    {
        // Internal node: record 0 for the left branch and 1 for the right
        // branch at this depth, then descend one level.
        path[len] = 0;
        HuffmanCoding(FBT->left,len+1);
        path[len] = 1;
        HuffmanCoding(FBT->right,len+1);
    }

    if (len == 0)
    {
        free(path);
        path = NULL;
        capacity = 0;
    }
}
// Entry point: reads the number of weighted leaves n (must be > 1) and then
// n integer weights from "input.txt", builds the Huffman tree and prints the
// tree in generalized-list form, its weighted path length and every leaf's
// code. Returns 0 on success and 1 on unreadable input or allocation failure.
int main()
{
    int n;
    ElemType *a;
    struct BTreeNode * fbt;

    if (freopen("input.txt","r",stdin) == NULL)
    {
        fprintf(stderr, "cannot open input.txt\n");
        return 1;
    }

    while(1)
    {
        // Stop on EOF or malformed input instead of looping forever on an
        // unset n.
        if (scanf("%d",&n) != 1)
        {
            fprintf(stderr, "failed to read n\n");
            return 1;
        }
        printf("n:%d\n",n);
        if (n>1)
            break;
        else
            printf("重输n值:");
    }

    a = malloc((size_t)n * sizeof *a);
    if (a == NULL)
    {
        fprintf(stderr, "out of memory\n");
        return 1;
    }
    for (int i = 0; i < n; ++i)
    {
        if (scanf("%d",&a[i]) != 1)
        {
            fprintf(stderr, "failed to read weight %d\n", i);
            free(a);
            return 1;
        }
        printf("a[%d]=%d\n",i,a[i]);
    }

    fbt = CreateHuffman(a,n);
    free(a);  // the weights were copied into the tree nodes

    printf("广义表形式的哈夫曼树:");
    PrintBTree_int(fbt);
    printf("\n");
    printf("哈夫曼树的带权路径长度:");
    printf("%d\n",WeightPathLength(fbt,0));
    printf("树中每个叶子结点的哈夫曼编码:\n");
    HuffmanCoding(fbt,0);
    return 0;
}

你可能感兴趣的:(机器学习)