网页抓取 (2)

  1 #include <stdlib.h>

  2 #include <stdio.h>

  3 #include <string.h>

  4 #include "lyGetHttpResult.h"

  5 #include "lyPublic/lyCodeConvert.c"

  /*
   * Synonym lookup on www.chazidian.com, first version.
   *
   * Flow: read a word from stdin, fetch the /jinyici/ page for it, dump that
   * page to Text.txt, locate the word inside a quoted link on the page, fetch
   * the linked page, and print the text that follows "<word></span> - " with
   * the queried word removed.
   *
   * CodeConvert mode 2 is applied to text that goes into a URL and mode 1 to
   * fetched pages; per the author's notes these are GBK -> UTF-8 and
   * UTF-8 -> GBK. GetDataFromWeb hands back a buffer that the caller releases
   * with free().
   *
   * An earlier, disabled draft that followed the page's "next page" link in a
   * loop until the word showed up has been removed, together with the other
   * commented-out experiments.
   */

  /* Read one line from stdin into buf (without the line ending).
   * Returns non-zero only when a non-empty line was read. */
  static int readLine(char *buf, size_t size)
  {
      size_t n;

      if (fgets(buf, (int)size, stdin) == NULL)
          return 0;
      n = strcspn(buf, "\r\n");
      buf[n] = '\0';
      return n > 0;
  }

  /* Download url and store the mode-1 converted text in page.
   * Returns 1 on success, 0 when the download failed. */
  static int fetchPage(char *url, char *page, size_t pageSize)
  {
      char *raw = GetDataFromWeb(url, NULL, NULL, 1, 5);

      if (raw == NULL)
          return 0;
      /* Clear first and force a terminator afterwards: whether CodeConvert
       * always NUL-terminates its output is not visible from here. */
      memset(page, 0, pageSize);
      CodeConvert(raw, page, pageSize, 1);
      page[pageSize - 1] = '\0';
      free(raw);
      return 1;
  }

  /* Walk backwards from `from` (inclusive) down to `base` looking for ch.
   * Returns the position found, or NULL if ch does not occur in that range. */
  static char *scanBack(char *from, char *base, char ch)
  {
      while (*from != ch) {
          if (from == base)
              return NULL;
          from--;
      }
      return from;
  }

  /* Copy the bytes starting at p into out, dropping '<', '>', '/', '-' and
   * lower-case ASCII letters (i.e. the HTML tag text). Copying stops at the
   * first '4' byte, at the end of the page, or when out is full.
   * NOTE(review): '4' is the stop marker the original used; presumably it is
   * specific to the target page layout - confirm.
   * NOTE(review): lower-case ASCII values can also occur as the second byte of
   * a GBK character, so this filter may drop half of such a character. */
  static void filterSynonyms(const char *p, char *out, size_t outSize)
  {
      size_t len = 0;

      for (; *p != '\0' && *p != '4' && len + 1 < outSize; p++) {
          if (*p != '<' && (*p < 'a' || *p > 'z') && *p != '/' && *p != '>' && *p != '-')
              out[len++] = *p;
      }
      out[len] = '\0';
  }

  /* Print text to stdout with the first occurrence of word removed.
   * When text starts with word, the two bytes after it are skipped as well
   * (what remains of the " - " separator after filtering). */
  static void printWithoutWord(const char *text, const char *word)
  {
      const char *hit = strstr(text, word);
      size_t wordLen = strlen(word);

      if (hit == NULL) {
          /* Nothing to remove; the original looped forever in this case. */
          fputs(text, stdout);
          return;
      }
      if (hit == text) {
          size_t skip = wordLen + 2;
          size_t textLen = strlen(text);

          if (skip > textLen)
              skip = textLen;
          fputs(text + skip, stdout);
          return;
      }
      printf("%.*s", (int)(hit - text), text);
      fputs(hit + wordLen, stdout);
  }

  /*
   * Entry point. Exit status keeps the values the original used: 1 after a
   * result was printed, 0 when the program stops early.
   */
  int main(void)
  {
      static const char baseUrl[] = "http://www.chazidian.com/jinyici/";
      static const char andStr[] = "</span> - "; /* marker that follows the headword */
      char szUrl[512] = "";
      char svData[1024 * 40] = "";
      FILE *fp;
      char *p, *q;
      char strFrom[100] = "", strTo[100] = "";
      char findStr[128] = "";
      char outStr[1024] = "";
      char reStr[100] = "";
      char str2[100] = "";
      int prefixLen;
      int n;

      if (!readLine(strFrom, sizeof(strFrom))) {
          fprintf(stderr, "no word given\n");
          return 0;
      }
      strcpy(reStr, strFrom); /* same-sized buffers; keeps the word as typed */

      /* Build the lookup URL; the word is a "%s" argument, never the format. */
      CodeConvert(strFrom, str2, sizeof(str2), 2);
      str2[sizeof(str2) - 1] = '\0';
      n = snprintf(szUrl, sizeof(szUrl), "%s%s", baseUrl, str2);
      if (n < 0 || (size_t)n >= sizeof(szUrl)) {
          fprintf(stderr, "URL too long\n");
          return 0;
      }

      if (!fetchPage(szUrl, svData, sizeof(svData))) {
          fprintf(stderr, "fetch failed: %s\n", szUrl);
          return 0;
      }

      /* Debug dump of the first page. "r+" keeps an existing file (it is
       * overwritten from the start, not truncated); otherwise create it. */
      fp = fopen("Text.txt", "r+");
      if (fp == NULL)
          fp = fopen("Text.txt", "w+");
      if (fp != NULL) {
          fputs(svData, fp);
          fclose(fp);
      }

      /* Find the word on the page; it is expected to sit inside a link. */
      p = strstr(svData, strFrom);
      if (p == NULL || p == svData) {
          fprintf(stderr, "word not found in the page\n");
          return 0;
      }

      if (*(p - 1) == '/') {
          /* The word is itself the last path segment of the link. */
          q = p - 1;
      } else {
          /* Otherwise take the path segment that precedes the nearest '/'
           * as the new key, and cut the link just before that segment. */
          char *slash = scanBack(p - 1, svData, '/');
          char *prev = (slash != NULL && slash > svData)
                           ? scanBack(slash - 1, svData, '/')
                           : NULL;
          size_t keyLen;

          if (prev == NULL) {
              fprintf(stderr, "link for the word not found\n");
              return 0;
          }
          keyLen = (size_t)(slash - prev - 1);
          if (keyLen >= sizeof(strFrom)) {
              fprintf(stderr, "link segment too long\n");
              return 0;
          }
          memcpy(strFrom, prev + 1, keyLen);
          strFrom[keyLen] = '\0';
          p = prev + 1;
          q = slash;
      }

      /* Back up to the opening quote of the attribute holding the link. */
      q = scanBack(q, svData, '"');
      if (q == NULL) {
          fprintf(stderr, "link for the word not found\n");
          return 0;
      }
      /* The link prefix is everything strictly between the quote and p. */
      prefixLen = (p > q + 1) ? (int)(p - q - 1) : 0;

      /* Marker to search for on the second page. */
      snprintf(findStr, sizeof(findStr), "%s%s", strFrom, andStr);
      puts(findStr);

      /* Second URL = link prefix + converted key. It must be built before
       * svData is overwritten by the next fetch, because q points into it. */
      CodeConvert(strFrom, strTo, sizeof(strTo), 2);
      strTo[sizeof(strTo) - 1] = '\0';
      n = snprintf(szUrl, sizeof(szUrl), "%.*s%s", prefixLen, q + 1, strTo);
      if (n < 0 || (size_t)n >= sizeof(szUrl)) {
          fprintf(stderr, "URL too long\n");
          return 0;
      }
      puts(szUrl);

      if (!fetchPage(szUrl, svData, sizeof(svData))) {
          fprintf(stderr, "fetch failed: %s\n", szUrl);
          return 0;
      }

      p = strstr(svData, findStr);
      if (p == NULL) {
          fprintf(stderr, "synonym marker not found\n");
          return 0;
      }
      filterSynonyms(p, outStr, sizeof(outStr));
      puts(outStr);

      /* Print the list again without the word that was asked for. */
      printWithoutWord(outStr, reStr);
      return 1;
  }

 早上写的版本是一页一页抓取的,页数太多时会变得很慢。后来老韦建议我在写第二个网站时直接用网址检索,但那个网站的网址用的是内码,不能直接获取信息,可能还要先检索它的内码。由此我想到,前面这个查字典的网站是不是也可以直接用网址去检索?改了一下,还好可以,这样就快多了。

改后的代码:

 1 #include <stdlib.h>

 2 #include <stdio.h>

 3 #include <string.h>

 4 #include "lyGetHttpResult.h"

 5 #include "lyPublic/lyCodeConvert.c"

 /*
  * Synonym lookup on www.chazidian.com, revised version: the word is appended
  * to the /jinyici/ URL and the result is read from that single page.
  *
  * CodeConvert mode 2 turns the GBK input into UTF-8 for the URL and mode 1
  * turns the fetched page back into GBK so the word can be searched for (per
  * the author's notes). GetDataFromWeb hands back a buffer that the caller
  * releases with free().
  */

 /* Read one line from stdin into buf (without the line ending).
  * Returns non-zero only when a non-empty line was read. */
 static int readLine(char *buf, size_t size)
 {
     size_t n;

     if (fgets(buf, (int)size, stdin) == NULL)
         return 0;
     n = strcspn(buf, "\r\n");
     buf[n] = '\0';
     return n > 0;
 }

 /* Download url and store the mode-1 converted text in page.
  * Returns 1 on success, 0 when the download failed. */
 static int fetchPage(char *url, char *page, size_t pageSize)
 {
     char *raw = GetDataFromWeb(url, NULL, NULL, 1, 5);

     if (raw == NULL)
         return 0;
     /* Clear first and force a terminator afterwards: whether CodeConvert
      * always NUL-terminates its output is not visible from here. */
     memset(page, 0, pageSize);
     CodeConvert(raw, page, pageSize, 1);
     page[pageSize - 1] = '\0';
     free(raw);
     return 1;
 }

 /* Copy the bytes starting at p into out, dropping '<', '>', '/', '-' and
  * lower-case ASCII letters (i.e. the HTML tag text). Copying stops at the
  * first '4' byte, at the end of the page, or when out is full.
  * NOTE(review): '4' is the stop marker the original used; presumably it is
  * specific to the target page layout - confirm.
  * NOTE(review): lower-case ASCII values can also occur as the second byte of
  * a GBK character, so this filter may drop half of such a character. */
 static void filterSynonyms(const char *p, char *out, size_t outSize)
 {
     size_t len = 0;

     for (; *p != '\0' && *p != '4' && len + 1 < outSize; p++) {
         if (*p != '<' && (*p < 'a' || *p > 'z') && *p != '/' && *p != '>' && *p != '-')
             out[len++] = *p;
     }
     out[len] = '\0';
 }

 /* Print text to stdout with the first occurrence of word removed.
  * When text starts with word, the two bytes after it are skipped as well
  * (what remains of the " - " separator after filtering). */
 static void printWithoutWord(const char *text, const char *word)
 {
     const char *hit = strstr(text, word);
     size_t wordLen = strlen(word);

     if (hit == NULL) {
         /* Nothing to remove; the original looped forever in this case. */
         fputs(text, stdout);
         return;
     }
     if (hit == text) {
         size_t skip = wordLen + 2;
         size_t textLen = strlen(text);

         if (skip > textLen)
             skip = textLen;
         fputs(text + skip, stdout);
         return;
     }
     printf("%.*s", (int)(hit - text), text);
     fputs(hit + wordLen, stdout);
 }

 /*
  * Entry point. Exit status keeps the values the original used: 1 after a
  * result was printed, 0 when the program stops early.
  */
 int main(void)
 {
     static const char baseUrl[] = "http://www.chazidian.com/jinyici/";
     static const char andStr[] = "</span> - "; /* marker that follows the headword */
     char szUrl[512] = "";
     char svData[1024 * 40] = "";
     char *p;
     char strFrom[100] = "";
     char findStr[128] = "";
     char outStr[1024] = "";
     char reStr[100] = "";
     char str2[100] = "";
     int n;

     if (!readLine(strFrom, sizeof(strFrom))) {
         fprintf(stderr, "no word given\n");
         return 0;
     }
     strcpy(reStr, strFrom); /* same-sized buffers; keeps the word as typed */

     /* Build the lookup URL; the word is a "%s" argument, never the format. */
     CodeConvert(strFrom, str2, sizeof(str2), 2);
     str2[sizeof(str2) - 1] = '\0';
     n = snprintf(szUrl, sizeof(szUrl), "%s%s", baseUrl, str2);
     if (n < 0 || (size_t)n >= sizeof(szUrl)) {
         fprintf(stderr, "URL too long\n");
         return 0;
     }

     if (!fetchPage(szUrl, svData, sizeof(svData))) {
         fprintf(stderr, "fetch failed: %s\n", szUrl);
         return 0;
     }

     /* The synonym list starts at "<word></span> - " on the result page. */
     snprintf(findStr, sizeof(findStr), "%s%s", strFrom, andStr);
     p = strstr(svData, findStr);
     if (p == NULL) {
         fprintf(stderr, "synonym marker not found\n");
         return 0;
     }
     filterSynonyms(p, outStr, sizeof(outStr));
     puts(outStr);

     /* Print the list again without the word that was asked for. */
     printWithoutWord(outStr, reStr);
     return 1;
 }

 

你可能感兴趣的:(网页抓取)