在Python中使用XPath进行数据爬取的案例

#!/usr/bin/env python

# -*- coding:utf-8 -*-

import urllib2

import urllib

from lxml import etree

class proxyObj(object):
    """Plain record describing one scraped proxy server.

    Attributes:
        proxyService: Host or IP address of the proxy, as a string.
        proxyPort: Port of the proxy, as a string.
        proxyHttp: Protocol label of the proxy; defaults to "http".

    The class can still be built with no arguments and filled in by
    attribute assignment; the constructor arguments are an optional
    shortcut for the same thing.
    """

    # Class-level defaults are kept so code that reads the attributes on the
    # class itself (rather than on an instance) keeps working.
    proxyService = ""
    proxyPort = ""
    proxyHttp = "http"

    def __init__(self, proxyService="", proxyPort="", proxyHttp="http"):
        self.proxyService = proxyService
        self.proxyPort = proxyPort
        self.proxyHttp = proxyHttp

    def __repr__(self):
        return "proxyObj(%r, %r, %r)" % (
            self.proxyService,
            self.proxyPort,
            self.proxyHttp,
        )

# Request headers sent with every HTTP request made by this script; the
# User-Agent is a desktop Chrome string.
proxy_headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.92 Safari/537.36"
}

# Base URL of the proxy listing; the page number is appended to it.
url = "http://www.xicidaili.com/nn/"

# URL fetched through each proxy to decide whether the proxy works.
proxy_test_url = "http://www.baidu.com/"

proxyLists = []    # every proxy scraped from the listing pages
enableProxy = []   # proxies that passed the connectivity test
disableProxy = []  # proxies that failed the connectivity test

# Scrape listing pages 1..3.
for page in range(1, 4):
    tempurl = url + str(page)
    print(tempurl)

    request = urllib2.Request(url=tempurl, headers=proxy_headers)
    response = urllib2.urlopen(request)
    try:
        if response.getcode() != 200:
            continue
        htmlcontent = response.read()
    finally:
        # Always release the connection, including on the skip path above.
        response.close()

    htmlobj = etree.HTML(htmlcontent)

    # Use XPath to pull every <tr> under the proxy table.
    htmldata = htmlobj.xpath("//table[@id='ip_list']//tr")
    print("htmldata--size:" + str(len(htmldata)))

    # Read the cells row by row with XPath relative to the row ("./td[n]").
    # An absolute "//td[n]" would search the whole document, and indexing
    # three independently collected lists in lockstep breaks as soon as one
    # cell yields zero or several text nodes.
    for row in htmldata:
        ip = "".join(row.xpath("./td[2]//text()")).strip()
        port = "".join(row.xpath("./td[3]//text()")).strip()
        protocol = "".join(row.xpath("./td[6]//text()")).strip()

        # Rows without <td> cells (e.g. a header row) or with missing data
        # produce empty strings and are skipped.
        if not ip or not port:
            continue

        proxy = proxyObj()
        proxy.proxyService = ip
        proxy.proxyPort = port
        if protocol:
            # Otherwise keep the class default protocol.
            proxy.proxyHttp = protocol
        proxyLists.append(proxy)

def proxyTest(proxys):
    """Probe each proxy and sort it into ``enableProxy`` or ``disableProxy``.

    For every entry a request for ``proxy_test_url`` is sent through an
    opener configured with that proxy (10 second timeout). A 200 response
    appends the proxy to the module-level ``enableProxy`` list; any other
    status or any error appends it to ``disableProxy``.

    Args:
        proxys: A list of objects exposing ``proxyService``, ``proxyPort``
            and ``proxyHttp`` string attributes. Anything that is not a
            non-empty list is ignored.

    Returns:
        None. Results are reported through the two module-level lists.
    """
    if not isinstance(proxys, list) or not proxys:
        return

    for proxy in proxys:
        # urllib2 matches proxy-mapping keys against the lowercase URL scheme
        # of the request, so a label such as "HTTP" taken verbatim from the
        # listing would never be applied and the request would go out
        # directly, making every proxy look usable.
        # NOTE(review): an "https" entry only applies to https:// URLs, so
        # HTTPS proxies are not exercised while proxy_test_url is http://.
        scheme = proxy.proxyHttp.lower()
        proxyInfo = {scheme: proxy.proxyService + ":" + proxy.proxyPort}

        # Build the proxy handler.
        proxyHandler = urllib2.ProxyHandler(proxyInfo)
        # Build an opener that routes through the proxy.
        opener = urllib2.build_opener(proxyHandler)
        # Build the test request.
        request = urllib2.Request(url=proxy_test_url, headers=proxy_headers)

        try:
            # Send the request with a timeout so dead proxies fail fast.
            response = opener.open(request, timeout=10)
        except Exception:
            # Any failure marks the proxy unusable; unlike a bare ``except``
            # this still lets KeyboardInterrupt / SystemExit stop the run.
            disableProxy.append(proxy)
            continue

        try:
            usable = response.getcode() == 200
        finally:
            response.close()

        if usable:
            enableProxy.append(proxy)
        else:
            disableProxy.append(proxy)

# Run the connectivity test over everything that was scraped, print a
# summary report, then append the usable proxies to proxy.txt.
print("************start************************")
print("抓取的代理总数有:"+str(len(proxyLists)))
print("准备进入测代理测试程式............")

proxyTest(proxyLists)

print("测代理测试程式完成............")
print("正在生成测代理报告............")
print("测代理报告:............")
print("+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++")
print("+ 抓取的代理总数有:+"+str(len(proxyLists))+"++++++++++++++++++++++++++++++++++++++")
print("+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++")
print("+ 抓取的代理可用数有:+"+str(len(enableProxy))+"++++++++++++++++++++++++++++++++++++++")
print("+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++")
print("+ 抓取的代理不可用数有:+"+str(len(disableProxy))+"++++++++++++++++++++++++++++++++++++++")
print("+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++")
# Closing banner: the original repeated the "start" banner here.
print("************end************************")

# Append one "host port protocol" record per line. Without the trailing
# newline all records run together on a single line and cannot be split
# back into proxies.
with open("proxy.txt", "a+") as f:
    for obj in enableProxy:
        f.write(obj.proxyService + " " + obj.proxyPort + " " + obj.proxyHttp + "\n")

你可能感兴趣的:(python)