今天在公司学了网页抓取,感觉在学校学的 C 只是皮毛,到了公司才发现很多东西都不懂。下面做个简单的总结:
1、被调用的函数所在的文件和工程不在同一个文件夹时,#include 里写的文件名要带上路径,如 #include "lyPublic/lyCodeConvert.c"。
2、要处理窗口事件,需要把“工程-设置-连接”里的“/subsystem:console /incremental:yes”改为“/subsystem:windows /incremental:yes”。
3、抓取网页时,要注意传递的网址在 UTF-8 和 GBK 之间的转换:网址里的汉字要先从 GBK 转为 UTF-8 再打开,不然会丢失关键词。
今天成果:
#include <stdlib.h> #include <stdio.h> #include <string.h> #include "lyGetHttpResult.h" #include "lyPublic/lyCodeConvert.c" int main() { char szUrl[512] = ""; char svData[1024 * 40] = ""; char *szData = NULL; FILE *fp; char *p, *q; char strFrom[100]="",strTo[100]=""; int len, falg; sprintf(szUrl, "http://www.chazidian.com/jinyicidaquan/"); szData = GetDataFromWeb(szUrl, NULL, NULL, 1, 5); if(!szData) return NULL; CodeConvert(szData, svData, sizeof(svData), 1); // puts(svData); /* if(fopen("Text.txt", "r+") == NULL) fp=fopen("Text.txt", "w+r"); else fp=fopen("Text.txt", "r+"); fputs(svData, fp);*/ gets(strFrom); while(strstr(svData, strFrom) == NULL)//判断是否在本页,不在的话进入下一页 { p = strstr(svData, "下一页"); q = p-60; memset(szUrl, 0, sizeof(szUrl)); len = 0; while(q++ < p) szUrl[len++]=*q; szData = GetDataFromWeb(szUrl, NULL, NULL, 1, 5); CodeConvert(szData, svData, sizeof(svData), 1); //HanziToAnsi(szData, sizeof(szData),svData,sizeof(svData)); } p = strstr(svData, strFrom);//找到起点 falg = 0;//标记是前词还是后词 if(*(p-1)=='/') { q = p - 1; } else { q = p - 1; while(*q!='/') { q--; } p=q+1; falg = 1;//标记为后词 } while(*q!='"') q--; memset(szUrl, 0, sizeof(szUrl)); len = 0; while(++q < p) szUrl[len++] = *q; if(!falg) { CodeConvert(strFrom, strTo, sizeof(strTo), 2); strcat(szUrl, strTo); } puts(szUrl); szData = GetDataFromWeb(szUrl, NULL, NULL, 1, 5); CodeConvert(szData, svData, sizeof(svData), 1);//转码 // HanziToAnsi(szData, sizeof(szData),svData,sizeof(svData)); puts(svData); if(fopen("Text.txt", "r+") == NULL) fp=fopen("Text.txt", "w+r"); else fp=fopen("Text.txt", "r+"); fputs(svData, fp); free(szData); szData = NULL; return 1; }
优化1.2版:
1 #include <stdlib.h> 2 #include <stdio.h> 3 #include <string.h> 4 #include "lyGetHttpResult.h" 5 #include "lyPublic/lyCodeConvert.c" 6 int main() 7 { 8 9 char szUrl[512] = ""; 10 char svData[1024 * 40] = ""; 11 char *szData = NULL; 12 // FILE *fp; 13 // char *p, *q,*q2,*p2; 14 char *p; 15 char strFrom[100] = "", strTo[100] = ""; 16 char findStr[20] = "", andStr[20] = "</span> - ";//查找标记串 17 char outStr[100] = "",reStr[100] = ""; 18 char str[100] = "",str2[100] = ""; 19 int len; 20 while(gets(strFrom)) 21 { //初串 22 memset(str,0,sizeof(str)); 23 memset(reStr,0,sizeof(reStr)); 24 memset(str2,0,sizeof(str2)); 25 memset(findStr,0,sizeof(findStr)); 26 memset(strTo,0,sizeof(strTo)); 27 strcpy(str,"http://www.chazidian.com/jinyici/"); 28 strcpy(reStr,strFrom); 29 CodeConvert(strFrom, str2, sizeof(str2), 2);//先将汉字GBK转为UTF-8再接道网址后面 30 strcat(str,str2); 31 32 sprintf(szUrl, str); 33 szData = GetDataFromWeb(szUrl, NULL, NULL, 1, 5); 34 if(!szData) 35 return NULL; 36 37 CodeConvert(szData, svData, sizeof(svData), 1);//找汉字的时候是找GBK。,所以还要转回来 38 /* if(fopen("Text.txt", "r+") == NULL) 39 fp=fopen("Text.txt", "w+r"); 40 else 41 fp=fopen("Text.txt", "r+"); 42 fputs(svData, fp);*/ 43 strcpy(findStr,strFrom);// 44 strcat(findStr,andStr); 45 p = strstr(svData, findStr); 46 len = strlen(outStr); 47 while(*p != '\n') ///有雨原网页的特点,设置为遇到回车结束 48 { 49 if(*p != '<' && (*p < 'a'||*p > 'z') && *p != '/' && *p != '>' && *p != '-') 50 { 51 outStr[len++] = *p; 52 } 53 p++; 54 } 55 puts(outStr); 56 57 p = strstr(outStr,reStr);//去重 58 len = strlen(reStr); 59 p+=len+2; 60 printf("%s\n",p); 61 memset(strFrom,0,sizeof(strFrom)); 62 memset(outStr,0,sizeof(outStr)); 63 free(szData); 64 szData = NULL; 65 } 66 return 1; 67 }