网页抓取总结(一)

今天在公司学了网页抓取,感觉在学校学的C语言只是皮毛,到了公司才发现很多东西都不懂。在此做个简单的总结:

1、被调用的源文件和工程不在同一个文件夹时,#include 里要写上相对路径,如 #include "lyPublic/lyCodeConvert.c"

2、要使用窗口事件,需要修改“工程-设置-连接”里的选项:把“/subsystem:console /incremental:yes”改为“/subsystem:windows /incremental:yes”

3、抓取网页时,要注意所传网址在 UTF-8 和 GBK 之间的编码转换:网址里的汉字要先从 GBK 转为 UTF-8 再打开,不然会丢失关键词

今天成果:

#include <stdlib.h>

#include <stdio.h>

#include <string.h>

#include "lyGetHttpResult.h"

#include "lyPublic/lyCodeConvert.c"

/*
 * Downloads `url` and stores the page text in `page` (capacity `pageSize`)
 * after running it through CodeConvert mode 1.  According to the notes in
 * this article, mode 1 turns the downloaded UTF-8 text into GBK so that it
 * can be searched with GBK keywords; CodeConvert itself is not shown here.
 *
 * The buffer returned by GetDataFromWeb is released with free() before
 * returning, so repeated calls do not leak.
 *
 * Returns 0 on success, -1 when the download failed.
 */
static int FetchPageAsGbk(char *url, char *page, size_t pageSize)
{
    char *raw = GetDataFromWeb(url, NULL, NULL, 1, 5);

    if (raw == NULL)
        return -1;

    /* Clear the old page first so a shorter page cannot keep a stale tail. */
    memset(page, 0, pageSize);
    CodeConvert(raw, page, pageSize, 1);
    free(raw);
    return 0;
}

/*
 * Finds the last `href="..."` attribute that ends before `anchor` inside the
 * text starting at `pageStart`, and copies its value into `url`.
 *
 * NOTE(review): this assumes the link is written as
 * <a ... href="URL" ...>label and that URL is absolute - confirm against
 * the real page markup.
 *
 * Returns 0 on success, -1 if no such attribute exists or it does not fit.
 */
static int ExtractHrefBefore(const char *pageStart, const char *anchor,
                             char *url, size_t urlSize)
{
    static const char href[] = "href=\"";
    const size_t hrefLen = sizeof href - 1;
    const char *scan;
    const char *closing;
    size_t n;

    if ((size_t)(anchor - pageStart) < hrefLen)
        return -1;

    /* Walk backwards from the label, never stepping before pageStart. */
    for (scan = anchor - hrefLen; ; scan--) {
        if (memcmp(scan, href, hrefLen) == 0)
            break;
        if (scan == pageStart)
            return -1;
    }

    scan += hrefLen;
    closing = memchr(scan, '"', (size_t)(anchor - scan));
    if (closing == NULL)
        return -1;

    n = (size_t)(closing - scan);
    if (n == 0 || n >= urlSize)
        return -1;

    memcpy(url, scan, n);
    url[n] = '\0';
    return 0;
}

/*
 * Looks up one word on the chazidian.com synonym index:
 *   1. downloads the index page and reads a keyword from stdin;
 *   2. follows the "next page" links until a page contains the keyword;
 *   3. rebuilds the URL of the keyword's entry from the surrounding link;
 *   4. downloads that entry, prints it and saves it to Text.txt.
 *
 * Returns EXIT_SUCCESS when the entry was saved, EXIT_FAILURE otherwise.
 */
int main(void)
{
    char szUrl[512] = "http://www.chazidian.com/jinyicidaquan/";
    char szNext[512] = "";
    char svData[1024 * 40] = "";
    char strFrom[100] = "";
    char strTo[300] = "";   /* UTF-8 can need more bytes than the GBK input */
    char *p, *q;
    size_t len;
    int isSecondWord = 0;   /* set when the keyword is not right after '/' */
    FILE *fp;

    if (FetchPageAsGbk(szUrl, svData, sizeof svData) != 0) {
        fprintf(stderr, "failed to download %s\n", szUrl);
        return EXIT_FAILURE;
    }

    /* fgets() is bounded by the buffer size, unlike gets(). */
    if (fgets(strFrom, sizeof strFrom, stdin) == NULL)
        return EXIT_FAILURE;
    strFrom[strcspn(strFrom, "\r\n")] = '\0';
    if (strFrom[0] == '\0') {
        /* An empty keyword matches at offset 0, leaving nothing to scan back over. */
        fprintf(stderr, "empty keyword\n");
        return EXIT_FAILURE;
    }

    /* Keep moving to the next page until the keyword shows up. */
    while (strstr(svData, strFrom) == NULL) {
        p = strstr(svData, "下一页");
        if (p == NULL ||
            ExtractHrefBefore(svData, p, szNext, sizeof szNext) != 0) {
            fprintf(stderr, "keyword not found and no next page link\n");
            return EXIT_FAILURE;
        }
        if (strcmp(szNext, szUrl) == 0) {
            /* The last page links to itself: stop instead of looping forever. */
            fprintf(stderr, "keyword not found on any page\n");
            return EXIT_FAILURE;
        }
        strcpy(szUrl, szNext);  /* both buffers have the same size */

        if (FetchPageAsGbk(szUrl, svData, sizeof svData) != 0) {
            fprintf(stderr, "failed to download %s\n", szUrl);
            return EXIT_FAILURE;
        }
    }

    /* Locate the keyword and the link text that surrounds it. */
    p = strstr(svData, strFrom);
    if (p == NULL || p == svData) {
        fprintf(stderr, "keyword is not inside a link\n");
        return EXIT_FAILURE;
    }

    q = p - 1;
    if (*q != '/') {
        /* Keyword is not the first word after '/': back up to that '/'. */
        while (q > svData && *q != '/')
            q--;
        if (*q != '/') {
            fprintf(stderr, "keyword is not inside a link\n");
            return EXIT_FAILURE;
        }
        p = q + 1;
        isSecondWord = 1;
    }

    /* Back up to the quote that opens the attribute value. */
    while (q > svData && *q != '"')
        q--;
    if (*q != '"') {
        fprintf(stderr, "keyword is not inside a link\n");
        return EXIT_FAILURE;
    }

    /* The URL prefix is everything between that quote and p. */
    len = (size_t)(p - (q + 1));
    if (len >= sizeof szUrl) {
        fprintf(stderr, "link is too long\n");
        return EXIT_FAILURE;
    }
    memcpy(szUrl, q + 1, len);
    szUrl[len] = '\0';

    /*
     * NOTE(review): as in the original, nothing is appended when the keyword
     * is the second word, so the URL is only the prefix up to '/' - confirm
     * that this is intended.
     */
    if (!isSecondWord) {
        /* Mode 2: per the article, GBK keyword -> UTF-8 for use in the URL. */
        CodeConvert(strFrom, strTo, sizeof strTo, 2);
        strTo[sizeof strTo - 1] = '\0';
        if (len + strlen(strTo) >= sizeof szUrl) {
            fprintf(stderr, "link is too long\n");
            return EXIT_FAILURE;
        }
        strcat(szUrl, strTo);
    }
    puts(szUrl);

    if (FetchPageAsGbk(szUrl, svData, sizeof svData) != 0) {
        fprintf(stderr, "failed to download %s\n", szUrl);
        return EXIT_FAILURE;
    }
    puts(svData);

    /* "w" creates or truncates, so no stale tail of an older page remains. */
    fp = fopen("Text.txt", "w");
    if (fp == NULL) {
        perror("Text.txt");
        return EXIT_FAILURE;
    }
    if (fputs(svData, fp) == EOF) {
        perror("Text.txt");
        fclose(fp);
        return EXIT_FAILURE;
    }
    if (fclose(fp) != 0) {
        perror("Text.txt");
        return EXIT_FAILURE;
    }

    return EXIT_SUCCESS;
}

 优化1.2版:

 1 #include <stdlib.h>

 2 #include <stdio.h>

 3 #include <string.h>

 4 #include "lyGetHttpResult.h"

 5 #include "lyPublic/lyCodeConvert.c"

 6 int main()

 7 {

 8 

 9     char szUrl[512] = "";

10     char svData[1024 * 40] = "";

11     char *szData = NULL;

12 //    FILE *fp;

13 //    char *p, *q,*q2,*p2;

14     char *p;

15     char strFrom[100] = "", strTo[100] = "";

16     char findStr[20] = "", andStr[20] = "</span> - ";//查找标记串

17     char outStr[100] = "",reStr[100] = "";

18     char str[100] = "",str2[100] = "";

19     int len;

20     while(gets(strFrom))

21     {                //初串

22         memset(str,0,sizeof(str));

23         memset(reStr,0,sizeof(reStr));

24         memset(str2,0,sizeof(str2));

25         memset(findStr,0,sizeof(findStr));

26         memset(strTo,0,sizeof(strTo));

27         strcpy(str,"http://www.chazidian.com/jinyici/");

28         strcpy(reStr,strFrom);

29         CodeConvert(strFrom, str2, sizeof(str2), 2);//先将汉字GBK转为UTF-8再接道网址后面

30         strcat(str,str2);

31 

32         sprintf(szUrl, str);

33         szData = GetDataFromWeb(szUrl, NULL, NULL, 1, 5);

34         if(!szData)

35             return NULL;

36 

37         CodeConvert(szData, svData, sizeof(svData), 1);//找汉字的时候是找GBK。,所以还要转回来

38         /*    if(fopen("Text.txt", "r+") == NULL)

39                     fp=fopen("Text.txt", "w+r");

40                 else

41                     fp=fopen("Text.txt", "r+");

42                 fputs(svData, fp);*/

43         strcpy(findStr,strFrom);//

44         strcat(findStr,andStr);

45         p = strstr(svData, findStr);

46         len = strlen(outStr);

47         while(*p != '\n') ///有雨原网页的特点,设置为遇到回车结束

48         {

49             if(*p != '<' && (*p < 'a'||*p > 'z') && *p != '/' && *p != '>' && *p != '-')

50             {

51                 outStr[len++] = *p;

52             }

53             p++;

54         }

55         puts(outStr);

56 

57         p = strstr(outStr,reStr);//去重

58         len = strlen(reStr);

59         p+=len+2;

60         printf("%s\n",p);

61         memset(strFrom,0,sizeof(strFrom));

62         memset(outStr,0,sizeof(outStr));

63         free(szData);

64         szData = NULL;

65     }

66     return 1;

67 }
View Code

 

你可能感兴趣的:(网页抓取)