我爱分享----百万商业圈C语言实现的倒排索引算法(含全部源码)
#include <stdio.h>
#include <stdlib.h>
char chr_legal[] = "abcdefghijklmnopqrstuvwxyz0123456789_-./";
int chr_idx[256] = {0};
char idx_chr[256] = {0};
#define FNAME 0
typedef struct trie_t *trie, trie_t;
struct trie_t {
trie next[sizeof(chr_legal)]; /* next letter; slot 0 is for file name */
int eow;
};
trie trie_new() { return calloc(sizeof(trie_t), 1); }
#define find_word(r, w) trie_trav(r, w, 1)
/* tree traversal: returns node if end of word and matches string, optionally
* create node if doesn't exist
*/
trie trie_trav(trie root, char * str, int no_create)
{
int c;
while (root) {
if ((c = str[0]) == '\0') {
if (!root->eow && no_create) return 0;
break;
}
if (! (c = chr_idx[c]) ) {
str++;
continue;
}
if (!root->next[c]) {
if (no_create) return 0;
root->next[c] = trie_new();
}
root = root->next[c];
str++;
}
return root;
}
/* complete traversal of whole tree, calling callback at each end of word node.
* similar method can be used to free nodes, had we wanted to do that.
*/
int trie_all(trie root, char path[], int depth, int (*callback)(char *))
{
int i;
if (root->eow && !callback(path)) return 0;
for (i = 1; i < sizeof(chr_legal); i++) {
if (!root->next[i]) continue;
path[depth] = idx_chr[i];
path[depth + 1] = '\0';
if (!trie_all(root->next[i], path, depth + 1, callback))
return 0;
}
return 1;
}
void add_index(trie root, char *word, char *fname)
{
trie x = trie_trav(root, word, 0);
x->eow = 1;
if (!x->next[FNAME])
x->next[FNAME] = trie_new();
x = trie_trav(x->next[FNAME], fname, 0);
x->eow = 1;
}
int print_path(char *path)
{
printf(" %s", path);
return 1;
}
/* pretend we parsed text files and got lower cased words: dealing *
* with text file is a whole other animal and would make code too long */
char *files[] = { "f1.txt", "source/f2.txt", "other_file" };
char *text[][5] ={{ "it", "is", "what", "it", "is" },
{ "what", "is", "it", 0 },
{ "it", "is", "a", "banana", 0 }};
trie init_tables()
{
int i, j;
trie root = trie_new();
for (i = 0; i < sizeof(chr_legal); i++) {
chr_idx[(int)chr_legal[i]] = i + 1;
idx_chr[i + 1] = chr_legal[i];
}
/* Enable USE_ADVANCED_FILE_HANDLING to use advanced file handling.
* You need to have files named like above files[], with words in them
* like in text[][]. Case doesn't matter (told you it's advanced).
*/
#define USE_ADVANCED_FILE_HANDLING 0
#if USE_ADVANCED_FILE_HANDLING
void read_file(char * fname) {
char cmd[1024];
char word[1024];
sprintf(cmd, "perl -p -e 'while(/(\\w+)/g) {print lc($1),\"\\n\"}' %s", fname);
FILE *in = popen(cmd, "r");
while (!feof(in)) {
fscanf(in, "%1000s", word);
add_index(root, word, fname);
}
pclose(in);
};
read_file("f1.txt");
read_file("source/f2.txt");
read_file("other_file");
#else
for (i = 0; i < 3; i++) {
for (j = 0; j < 5; j++) {
if (!text[i][j]) break;
add_index(root, text[i][j], files[i]);
}
}
#endif /*USE_ADVANCED_FILE_HANDLING*/
return root;
}
void search_index(trie root, char *word)
{
char path[1024];
printf("Search for \"%s\": ", word);
trie found = find_word(root, word);
if (!found) printf("not found\n");
else {
trie_all(found->next[FNAME], path, 0, print_path);
printf("\n");
}
}
int main()
{
trie root = init_tables();
search_index(root, "what");
search_index(root, "is");
search_index(root, "banana");
search_index(root, "boo");
return 0;
}
百万商业圈小型企业云ERP总体开发规划
软件工程概述 - 企业架构 - IT企业做大做强之根本 - 之我见
实践检验得出的真理:asp.net 项目在 linux mono上编译需要修改的只有 3个地方
给大家漏一手本人亲自精心撰写的通用ajax框架,完全兼容 IE FireFox各个版本!(附完整源码及完整范例)
开发了一个中文分词服务器(C语言开发+词库+源代码),最大特色可以让javascript来调用!
用纯C语言写了一个HtmlParse(网页分析器)外带采集功能,大小只有200K(免费+开源+操作示意图)
开源:完全自主研发搜索引擎1.0源代码及说明,单机400万网页,任意50词以内的检索不超过 20毫秒
开源:基于百万商业圈.NET开发框架开发的并行带分词的采集器
百万商业圈 .NET 开发框架2.0及开发框架API说明书(BWFW)(含并行计算及中英文分词功能)
分享一点代码(小型C web开发框架),用C语言实现的一个WEB 文件上传(含全部源代码)一
天心天字辈ERP全部PDK源代码到了我手上的后果 - 超越天心之WEB天云
大家看看我的BS的甘特图排程做的如何? 无刷新Ajax甘特图 展示生产排程结果 演示