下面的代码是 Linux 下 C 语言正则表达式(regex.h)的一个应用。程序通过 privoxy -> tor 代理链下载代理列表,默认使用的代理地址是 127.0.0.1:8118。
因此在编译后、运行前,请先确保 privoxy 和 tor 都已在运行,否则无法下载到代理。一般情况下一次能下载到 2000 个左右的代理。
#include <ctype.h>
#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <strings.h>

#include <sys/types.h>
#include <sys/socket.h>
#include <arpa/inet.h>
#include <netdb.h>
#include <netinet/in.h>
#include <regex.h>
#include <unistd.h>
/* Number of regmatch_t slots passed to regexec() in parse_http_body(). */
#define NM 10
/* Incremented by parse_http_body() every time a proxy entry is written. */
static int proxy_save_count = 0;
/* Number of patterns in the regex table; set by main() before compiling. */
static int reg_num;
/* TCP port of the local HTTP proxy that get_http_body() connects to. */
static unsigned short through_proxy_port = 8118;
/* Letter table for obfuscated ports: code[d] is the letter standing for
 * digit d.  Filled by get_code(), searched by get_index(). */
static char code[10];
/* Dotted-quad address of the local HTTP proxy that get_http_body() connects to. */
static char *through_proxy_ip = "127.0.0.1";
/* Table of compiled patterns, allocated by compile_regexs(). */
static regex_t **pregs;
/*
 * Print a message on stdout and terminate the process.
 *
 * i:       exit status handed to exit().
 * message: text to print.  When str is non-NULL it is used as a printf
 *          format (expected to contain a single %s); when str is NULL it is
 *          printed verbatim.
 * str:     optional argument substituted into message, or NULL.
 *
 * This function does not return.
 */
void die(int i, char *message, char *str)
{
    if (str != NULL) {
        printf(message, str);
    } else {
        /* There is no argument to consume, so never interpret message as a
         * format string: a stray '%' would make printf read garbage. */
        fputs(message, stdout);
    }
    exit(i);
}
/*
 * Write the whole buffer to a descriptor, looping over short writes.
 *
 * sockfd: descriptor to write to.
 * buf:    data to send.
 * len:    in: number of bytes to send; out: number of bytes actually sent.
 *
 * Returns 0 when every byte was written (including the trivial case of a
 * zero or negative *len), -1 when write() failed.
 */
int write_all(int sockfd, char *buf, int *len)
{
    int total = 0;
    int bytesleft = *len;
    ssize_t n = 0; /* initialised so the final test is defined when *len <= 0 */

    while (total < *len) {
        n = write(sockfd, buf + total, (size_t)bytesleft);
        if (n == -1) {
            if (errno == EINTR) {
                /* Interrupted before anything was written: just retry. */
                n = 0;
                continue;
            }
            break;
        }
        total += (int)n;
        bytesleft -= (int)n;
    }
    *len = total;
    return n == -1 ? -1 : 0;
}
/*
 * Convert a NUL-terminated string to lower case in place.
 *
 * p: string to convert; characters that are not upper-case letters are left
 *    untouched.
 */
void to_lower(char *p)
{
    for (; *p != '\0'; p++) {
        /* <ctype.h> functions take an unsigned char value (or EOF); passing a
         * negative plain char is undefined behaviour. */
        *p = (char)tolower((unsigned char)*p);
    }
}
/*
 * Read an HTTP response header from sockfd one byte at a time, up to and
 * including the blank line that ends it.
 *
 * sockfd: descriptor positioned at the start of the response.
 * bytes:  set to the value of the Content-Length header when one is seen;
 *         left untouched otherwise.
 *
 * Returns 0 on success, -1 when the status line does not contain "HTTP/1."
 * and "200" (the status line is printed on stdout), -2 when read() fails.
 */
int parse_http_header(int sockfd, long *bytes)
{
    char buf[100], key[30], value[70], ch;
    size_t i = 0;  /* number of characters collected for the current line */
    int j = 0;     /* number of complete lines seen so far */
    ssize_t n;

    memset(key, 0, sizeof(key));
    memset(value, 0, sizeof(value));
    while ((n = read(sockfd, &ch, 1)) == 1) {
        if (ch == '\n') {
            /* An empty line terminates the header. */
            if (i == 0)
                break;
            i = 0;
        } else if (ch != '\r') {
            /* Over-long lines are truncated instead of overflowing buf. */
            if (i < sizeof(buf) - 1)
                buf[i++] = ch;
        } else {
            /* '\r' marks the end of a line: inspect what was collected. */
            buf[i] = '\0';
            if (j++ == 0) {
                if (!strstr(buf, "HTTP/1.") || !strstr(buf, "200")) {
                    printf("%s\n", buf);
                    return -1;
                }
            } else if (sscanf(buf, "%29[^:]: %69[^\n]", key, value) == 2) {
                /* Field widths keep key/value within their buffers; header
                 * names are compared case-insensitively. */
                if (strcasecmp("content-length", key) == 0)
                    *bytes = strtol(value, NULL, 10);
            }
        }
    }
    if (n == -1) {
        fprintf(stderr, "read() error\n");
        return -2;
    }
    return 0;
}
/*
 * Read a response body whose size was announced by Content-Length.
 *
 * sockfd: descriptor positioned at the start of the body; it is closed
 *         before this function returns.
 * bytes:  announced body size.
 * recv:   receives a malloc'ed, NUL-terminated buffer holding the body; the
 *         caller frees it.  Set to NULL when an error is returned.
 *
 * At most `bytes` bytes are read, so a peer that sends more than it announced
 * cannot overflow the buffer, and the function does not wait for the peer to
 * close the connection.  If the peer closes early the partial body is
 * returned.
 *
 * Returns 0 on success, 1 when read() fails.  Exits through die() when the
 * buffer cannot be allocated.
 */
int with_content_length(int sockfd, long bytes, char **recv)
{
    long total = 0;
    ssize_t n = 0;

    if (bytes < 0)
        bytes = 0;
    /* calloc gives a zero-filled buffer, so the body is always terminated. */
    if ((*recv = calloc(1, (size_t)bytes + 1)) == NULL)
        die(-1, "Memory allocation failed!\n", NULL);

    while (total < bytes) {
        n = read(sockfd, *recv + total, (size_t)(bytes - total));
        if (n <= 0)
            break;
        total += n;
    }
    close(sockfd);

    if (n == -1) {
        fprintf(stderr, "read() error\n");
        free(*recv);
        *recv = NULL;
        return 1;
    }
    return 0;
}
/*
 * Read a response body of unknown size until the peer closes the connection.
 *
 * sockfd: descriptor positioned at the start of the body; it is closed
 *         before this function returns.
 * recv:   receives a malloc'ed, NUL-terminated buffer holding the body (an
 *         empty string when the body is empty); the caller frees it.  Set to
 *         NULL when an error is returned.
 *
 * Returns 0 on success, 1 when read() fails.  Exits through die() when memory
 * cannot be allocated.
 */
int without_content_length(int sockfd, char **recv)
{
    char buf[1024];
    char *body, *grown;
    size_t total = 0;
    ssize_t n;

    /* Start with an empty string so that *recv is valid even for an empty body. */
    if ((body = malloc(1)) == NULL)
        die(-1, "Memory allocation failed!\n", NULL);
    body[0] = '\0';

    while ((n = read(sockfd, buf, sizeof(buf))) > 0) {
        /* Keep the old pointer until realloc is known to have succeeded. */
        grown = realloc(body, total + (size_t)n + 1);
        if (grown == NULL) {
            free(body);
            die(-1, "Memory reallocation failed!\n", NULL);
        }
        body = grown;
        /* Append at the known offset instead of rescanning with strcat. */
        memcpy(body + total, buf, (size_t)n);
        total += (size_t)n;
        body[total] = '\0';
    }
    close(sockfd);

    if (n == -1) {
        fprintf(stderr, "read() error\n");
        free(body);
        *recv = NULL;
        return 1;
    }
    *recv = body;
    return 0;
}
/*
 * Send an HTTP request through the local proxy and fetch the response body.
 *
 * send: complete, NUL-terminated HTTP request text.
 * recv: receives the malloc'ed response body from the body reader.
 *
 * Returns 0 on success, or a code naming the step that failed:
 * 1 socket(), 2 connect(), 3 write, 4 response header rejected,
 * 5 body read with Content-Length, 6 body read without Content-Length.
 */
int get_http_body(char *send, char **recv)
{
    int sockfd, len = (int)strlen(send);
    struct sockaddr_in sa;
    long bytes = 0;

    if ((sockfd = socket(AF_INET, SOCK_STREAM, 0)) == -1) {
        fprintf(stderr, "socket() falied\n");
        return 1;
    }

    /* Zero the whole structure so no field is left with stack garbage. */
    memset(&sa, 0, sizeof(sa));
    sa.sin_family = AF_INET;
    sa.sin_addr.s_addr = inet_addr(through_proxy_ip);
    sa.sin_port = htons(through_proxy_port);

    if (connect(sockfd, (struct sockaddr *)&sa, sizeof(sa)) == -1) {
        fprintf(stderr, "connect() failed\n");
        close(sockfd);
        return 2;
    }
    if (write_all(sockfd, send, &len) == -1) {
        fprintf(stderr, "write() error\n");
        close(sockfd);
        return 3;
    }
    if (parse_http_header(sockfd, &bytes) < 0) {
        close(sockfd);
        return 4;
    }

    /* From here on the body reader owns sockfd (it closes the descriptor
     * when it finishes), so it must not be closed again here. */
    if (bytes > 0)
        return with_content_length(sockfd, bytes, recv) ? 5 : 0;
    return without_content_length(sockfd, recv) ? 6 : 0;
}
/*
 * Copy the byte range [start, end) of src into a newly allocated string.
 *
 * start: index of the first byte to copy.
 * end:   index one past the last byte to copy; when end <= start the result
 *        is an empty string.
 * src:   source buffer.
 * dst:   receives the malloc'ed, NUL-terminated copy; the caller frees it.
 *
 * Always returns 0.  Exits through die() when allocation fails.
 */
int sub_string(int start, int end, char *src, char **dst)
{
    /* Clamp reversed ranges so the allocation can never be too small for the
     * terminator. */
    size_t len = (end > start) ? (size_t)(end - start) : 0;

    if ((*dst = malloc(len + 1)) == NULL)
        die(-1, "Memory allocation failed!\n", NULL);
    if (len > 0)
        memcpy(*dst, src + start, len);
    (*dst)[len] = '\0';
    return 0;
}
/*
 * Look up the first character of p in the letter table `code`.
 *
 * Returns the position (0..9) of the first table entry equal to *p, or 10
 * when no entry matches.
 */
int get_index(char *p)
{
    int pos = 0;

    while (pos < 10 && code[pos] != *p)
        pos++;
    return pos;
}
/*
 * Rebuild the letter table `code` from a string of ten 4-character entries
 * of the form "<letter>=<digit>;".
 *
 * p: NUL-terminated table text.  For every well-formed entry,
 *    code[<digit>] is set to <letter>.
 *
 * The table is cleared first.  Parsing stops at the end of the string, and
 * entries whose digit position does not hold '0'..'9' are skipped, so a short
 * or malformed string can neither be read past its terminator nor index
 * outside `code`.
 */
void get_code(char *p)
{
    int i;

    memset(code, 0, sizeof(code));
    for (i = 0; i < 39; i += 4) {
        /* Need at least "<letter>=<digit>" before the terminator. */
        if (p[i] == '\0' || p[i + 1] == '\0' || p[i + 2] == '\0')
            break;
        if (p[i + 2] >= '0' && p[i + 2] <= '9')
            code[p[i + 2] - '0'] = p[i];
        /* No separator after this entry means there is no next entry. */
        if (p[i + 3] == '\0')
            break;
    }
}
/*
 * Decode an obfuscated port such as "+a+b+c" and write it to fp followed by
 * a newline.
 *
 * port: '+'-separated tokens; the first character of every token is looked
 *       up with get_index() and the resulting digit is written.
 * fp:   output stream.
 *
 * A token whose first character is not in the letter table produces the
 * message "Can't not decode port" on stdout and no digit.
 *
 * The string is scanned in place, so its length is not limited and the
 * caller's buffer is not modified.
 */
void print_port(char *port, FILE *fp)
{
    static const char delims[] = "+";
    char *tok = port;
    int digit;

    for (;;) {
        /* Skip separators, then handle the token that starts here. */
        tok += strspn(tok, delims);
        if (*tok == '\0')
            break;
        digit = get_index(tok);
        if (digit < 10)
            fprintf(fp, "%d", digit);
        else
            printf("Can't not decode port\n");
        tok += strcspn(tok, delims);
    }
    fprintf(fp, "\n");
}
/*
 * Scan a response body with one of the compiled patterns and either load the
 * port letter table or write proxy entries to fp.
 *
 * str:    NUL-terminated body text (NULL is treated as an empty body).
 * fp:     stream the proxy entries are written to.
 * eflags: flags passed to regexec().
 * i:      index of the match group holding the address (or, in table mode,
 *         the letter table text).
 * j:      index of the match group whose end offset is where the search
 *         resumes.
 * n:      index of the match group holding the port; a value >= NM means
 *         the pattern has no separate port group.
 * k:      index of the pattern in pregs.
 * flag:   with n >= NM: non-zero means "load the letter table from the first
 *         match", zero means "write every match as one entry".
 *         with n <  NM: non-zero means the port is obfuscated and is decoded
 *         with print_port(), zero means it is written as matched.
 *
 * proxy_save_count is incremented for every entry written.  Always returns 0.
 */
int parse_http_body(char *str, FILE *fp, int eflags, int i, int j, int n, int k, int flag)
{
    char *ip = NULL, *port = NULL;
    regmatch_t pm[NM];
    const size_t nm = NM;

    if (str == NULL)
        return 0;

    if (n >= NM) {
        if (flag) {
            /* Table mode: only the first match matters. */
            if (*str && regexec(pregs[k], str, nm, pm, eflags) == 0) {
                sub_string(pm[i].rm_so, pm[i].rm_eo, str, &ip);
                get_code(ip);
                free(ip);
            }
            return 0;
        }
        while (*str && regexec(pregs[k], str, nm, pm, eflags) == 0) {
            sub_string(pm[i].rm_so, pm[i].rm_eo, str, &ip);
            proxy_save_count++;
            fprintf(fp, "%s\n", ip);
            free(ip);
            /* A match that does not advance would loop forever. */
            if (pm[j].rm_eo <= 0)
                break;
            str += pm[j].rm_eo;
        }
        return 0;
    }

    while (*str && regexec(pregs[k], str, nm, pm, eflags) == 0) {
        sub_string(pm[i].rm_so, pm[i].rm_eo, str, &ip);
        sub_string(pm[n].rm_so, pm[n].rm_eo, str, &port);
        proxy_save_count++;
        if (flag) {
            fprintf(fp, "%s:", ip);
            print_port(port, fp);
        } else {
            fprintf(fp, "%s:%s\n", ip, port);
        }
        free(ip);
        free(port);
        /* A match that does not advance would loop forever. */
        if (pm[j].rm_eo <= 0)
            break;
        str += pm[j].rm_eo;
    }
    return 0;
}
/*
 * Download one page and write the proxies found in it to fp.
 *
 * url:    absolute "http://" URL of the page.
 * keys:   two pattern indices.  When keys[1] is negative the page is scanned
 *         with pattern keys[0] only; otherwise pattern keys[0] is used to
 *         load the port letter table and pattern keys[1] to extract the
 *         entries.
 * fp:     stream the proxy entries are written to.
 * eflags: flags passed on to regexec().
 *
 * Returns 0 on success, 1 when the URL cannot be parsed or the request does
 * not fit the request buffer, 2 when the download fails or keys[0] is out of
 * range, 3 when either key is out of range in two-pattern mode.
 */
int print_proxys(char *url, int *keys, FILE *fp, int eflags)
{
    char host[256], send[1024], *data = NULL;
    int len, rc = 0;

    memset(host, 0, sizeof(host));
    memset(send, 0, sizeof(send));
    /* The field width keeps an over-long host name inside the buffer. */
    if (sscanf(url, "http://%255[^/]", host) != 1) {
        printf("Can't not parse url\n");
        return 1;
    }
    len = snprintf(send, sizeof(send),
                   "GET %s HTTP/1.1\r\n"
                   "Host: %s\r\n"
                   "User-Agent: Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.2.13) Gecko/20101203 Firefox/3.6.13\r\n"
                   "Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8\r\n"
                   "Accept-Language: en-us,en;q=0.5\r\n"
                   "Accept-Encoding: deflate\r\n"
                   "Accept-Charset: ISO-8859-1,utf-8;q=0.7,*;q=0.7\r\n"
                   "Keep-Alive: 115\r\n"
                   "Proxy-Connection: keep-alive\r\n"
                   "\r\n",
                   url, host);
    if (len < 0 || (size_t)len >= sizeof(send)) {
        /* A truncated request would be malformed; refuse to send it. */
        fprintf(stderr, "request too long\n");
        return 1;
    }

    if (get_http_body(send, &data) != 0) {
        /* A partially read body may have been handed back; free(NULL) is a no-op. */
        free(data);
        return 2;
    }
    if (data == NULL) {
        /* Empty body: nothing to scan. */
        return 0;
    }

    if (keys[1] < 0) {
        if (keys[0] > reg_num - 1 || keys[0] < 0)
            rc = 2;
        else
            parse_http_body(data, fp, eflags, 0, 2, 11, keys[0], 0);
    } else {
        if (keys[0] > reg_num - 1 || keys[0] < 0 || keys[1] > reg_num - 1 || keys[1] < 0) {
            rc = 3;
        } else {
            /* First pass loads the letter table, second pass writes the entries. */
            parse_http_body(data, fp, eflags, 0, 1, 11, keys[0], 1);
            parse_http_body(data, fp, eflags, 1, 3, 3, keys[1], 1);
        }
    }
    free(data);
    return rc;
}
/*
 * Compile reg_num patterns into the global table pregs.
 *
 * regexs: array of at least reg_num pattern strings.
 * cflags: flags passed to regcomp().
 *
 * Returns 0 on success.  When pattern number i (counting from 0) fails to
 * compile, returns i + 1; in that case everything allocated so far is
 * released and pregs is reset to NULL.  Exits through die() when memory
 * cannot be allocated.
 */
int compile_regexs(char **regexs, int cflags)
{
    int i, failed;

    if ((pregs = calloc((size_t)reg_num, sizeof(*pregs))) == NULL)
        die(-1, "Memory allocation failed!\n", NULL);

    for (i = 0; i < reg_num; i++) {
        if ((pregs[i] = malloc(sizeof(*pregs[i]))) == NULL)
            die(-1, "Memory allocation failed!\n", NULL);
        if (regcomp(pregs[i], regexs[i], cflags) != 0) {
            fprintf(stderr, "regcomp() failed\n");
            failed = i + 1;
            /* The failed slot holds no compiled pattern: free the storage only. */
            free(pregs[i]);
            /* Earlier slots were compiled successfully: release them fully. */
            while (i-- > 0) {
                regfree(pregs[i]);
                free(pregs[i]);
            }
            free(pregs);
            pregs = NULL;
            return failed;
        }
    }
    return 0;
}
/*
 * Release every compiled pattern in pregs and the table itself.
 *
 * regfree() only releases what regcomp() allocated inside a regex_t; the
 * regex_t objects themselves were obtained with malloc and are freed here as
 * well.  Safe to call when the table is NULL; pregs is NULL afterwards.
 */
void free_regs(void)
{
    int i;

    if (pregs == NULL)
        return;
    for (i = 0; i < reg_num; i++) {
        regfree(pregs[i]);
        free(pregs[i]);
    }
    free(pregs);
    pregs = NULL;
}
/*
 * Download every page in the URL list through the local proxy and save the
 * proxies found to "proxy.txt".
 *
 * Returns 0 on success, 1 when the patterns cannot be compiled or the output
 * file cannot be written completely.  A page that fails to download is
 * reported on stderr and skipped.
 */
int main(void)
{
    char *file = "proxy.txt";
    FILE *fp = NULL;
    int num, i, rc, keys[2];
    int status = 0;
    int eflags = 0, cflags = REG_EXTENDED|REG_NEWLINE;
    /* 0: "ip:port" in clear text; 1: the letter=digit table used to hide
     * ports; 2: an address followed by a '+letter' encoded port. */
    char *regexs[] = {"([0-9]{1,3}\\.){3}([0-9]{1,3}:[0-9]{1,5})",
                      "([a-z]=[0-9];){10}",
                      "(([0-9]{1,3}\\.){3}[0-9]{1,3})[^\\+]+((\\+[a-z]){1,5})",
    };
    char *urls[] = {"http://www.samair.ru/proxy/type-01.htm",
                    "http://www.samair.ru/proxy/type-02.htm",
                    "http://www.samair.ru/proxy/type-03.htm",
                    "http://www.samair.ru/proxy/type-04.htm",
                    "http://www.samair.ru/proxy/type-05.htm",
                    "http://www.samair.ru/proxy/type-06.htm",
                    "http://www.samair.ru/proxy/type-07.htm",
                    "http://www.samair.ru/proxy/type-08.htm",
                    "http://www.samair.ru/proxy/type-09.htm",
                    "http://www.samair.ru/proxy/type-10.htm",
                    "http://www.samair.ru/proxy/type-11.htm",
                    "http://www.samair.ru/proxy/type-12.htm",
                    "http://www.samair.ru/proxy/type-13.htm",
                    "http://www.samair.ru/proxy/type-14.htm",
                    "http://www.samair.ru/proxy/type-15.htm",
                    "http://www.samair.ru/proxy/type-16.htm",
                    "http://www.samair.ru/proxy/type-17.htm",
                    "http://www.samair.ru/proxy/type-18.htm",
                    "http://www.samair.ru/proxy/type-19.htm",
                    "http://www.samair.ru/proxy/type-20.htm",
    };

    if ((fp = fopen(file, "w")) == NULL)
        die(-1, "Failed to create the file %s\n", file);

    reg_num = sizeof(regexs)/sizeof(regexs[0]);
    if (compile_regexs(regexs, cflags) != 0) {
        fclose(fp);
        return 1;
    }

    num = sizeof(urls)/sizeof(urls[0]);
    printf("Proxy downloading is processing...\n");
    for (i = 0; i < num; i++) {
        if (i < 10) {
            /* First ten pages: load the letter table (pattern 1), then
             * extract entries with encoded ports (pattern 2). */
            keys[0] = 1; keys[1] = 2;
        } else {
            /* Remaining pages: clear-text "ip:port" entries (pattern 0). */
            keys[0] = 0; keys[1] = -1;
        }
        rc = print_proxys(urls[i], keys, fp, eflags);
        if (rc != 0) {
            /* Best effort: report the page and carry on with the next one. */
            fprintf(stderr, "Failed to process %s (error %d)\n", urls[i], rc);
        }
    }

    if (proxy_save_count > 0)
        printf("Now we have downloaded %d proxys, saved in the file %s\n", proxy_save_count, file);
    else
        printf("Oops, no proxy saved!\n");

    /* fclose flushes buffered output, so a failure here means lost data. */
    if (fclose(fp) != 0) {
        fprintf(stderr, "Failed to write the file %s\n", file);
        status = 1;
    }
    free_regs();
    return status;
}
来自http://www.innohot.com/?p=19