最近又重新拾起了久违的爬虫,写了一个代理ip的爬取、验证和存储工具。
1.爬取网站是西刺代理,使用了requests+beautifulsoup库
2.验证的网站使用了京东和淘宝的首页,用了urllib+beautifulsoup库
3.将爬取并通过验证的代理ip存入本地的数据库中,这里使用的是sql server 2008,用的是pyodbc库
4.验证的时候开了20个线程,用了python里的threading库
5.定期从库中拿出代理ip,将失效的ip删除
爬取代码:
# -*- coding: utf-8 -*-
import time
import pyodbc
import requests
import urllib
import threading
import socket
import sys
import csv
from bs4 import BeautifulSoup
# Python 2-only workaround: reload(sys) makes sys.setdefaultencoding
# reachable again so the process-wide default codec can be set to UTF-8.
# NOTE(review): presumably needed so the scraped unicode values and the
# Chinese byte-string messages printed below can be mixed without
# UnicodeDecodeError -- confirm before porting; neither the reload builtin
# nor sys.setdefaultencoding exists on Python 3.
reload(sys)
sys.setdefaultencoding("utf-8")
# Module-level state shared with the worker threads defined below.
all_message = []  # [ip, port] pairs appended by ipGet
aim_ip = []       # (ip, port, seconds) tuples appended by ipCheck
# Listing pages to scrape; range(1, 2) yields page 1 only.
target_url = ['http://www.xicidaili.com/nn/%d' % page for page in range(1, 2)]
class ipGet(threading.Thread):
    """Worker thread that scrapes one proxy-listing page.

    Every ``[ip, port]`` pair found in the page's ``ip_list`` table is
    appended to the module-level ``all_message`` list.
    """

    def __init__(self, target):
        """Store the URL of the listing page this thread will fetch.

        Args:
            target: URL of a single listing page.
        """
        threading.Thread.__init__(self)
        self.target = target

    def Get_ip(self):
        """Download ``self.target`` and collect the proxies it lists.

        The second and third cells of each table row (after the header row)
        are taken as IP and port.  A page without the ``ip_list`` table and
        rows with fewer than three cells are skipped instead of raising.

        Raises:
            requests.exceptions.RequestException: if the page cannot be
                fetched (including the timeout below).
        """
        headers = {'User-Agent':'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11'}
        # Without a timeout a stalled server would block this thread, and
        # therefore the join() in the main script, indefinitely.
        html = requests.get(self.target, headers=headers, timeout=10)
        soup = BeautifulSoup(html.text)
        table = soup.find('table', id='ip_list')
        if table is None:
            # e.g. an error or ban page was returned instead of the listing
            print("No 'ip_list' table found in %s" % self.target)
            return
        # The first row is the table header.
        for tr in table.find_all('tr')[1:]:
            tds = tr.find_all('td')
            if len(tds) < 3:
                continue
            ip = tds[1].text.strip()
            opening = tds[2].text.strip()
            all_message.append([ip, opening])

    def run(self):
        """Thread entry point: scrape the page."""
        self.Get_ip()
class ipCheck(threading.Thread):
    """Worker thread that validates a slice of scraped proxies.

    A proxy passes when both test pages can be fetched with it configured
    and each response contains a ``<link rel="dns-prefetch">`` element.
    Passing proxies are appended to the module-level ``aim_ip`` list as
    ``(ip, port, elapsed_seconds)``.
    """

    def __init__(self, ipList, timeout=3):
        """Store the proxies to check.

        Args:
            ipList: sequence of ``[ip, port]`` string pairs.
            timeout: socket timeout in seconds applied to the test
                requests.  Defaults to 3, the value that was previously
                hard-coded in ``Check_ip``.
        """
        threading.Thread.__init__(self)
        self.ipList = ipList
        self.timeout = timeout
        self.test_url = 'http://www.jd.com/?cu=true&utm_source=click.linktech.cn&utm_medium=tuiguang&utm_campaign=t_4_A100220955&utm_term=7e7c13a102664ab3a6886ccefa66d930&abt=3'
        self.another_url = 'https://www.taobao.com/'

    def Check_ip(self):
        """Probe every proxy in ``self.ipList`` and record the working ones.

        Any exception raised while probing a proxy is printed and that
        proxy is treated as failed; the loop continues with the next one.
        """
        # NOTE: this sets the default for all sockets in the process, not
        # only for this thread.
        socket.setdefaulttimeout(self.timeout)
        for ip in self.ipList:
            try:
                proxy_host = "http://" + ip[0] + ":" + ip[1]
                # NOTE(review): only an 'http' entry is supplied while
                # another_url is https -- verify the second request is
                # really routed through the proxy.
                proxy_temp = {"http": proxy_host}
                t_start = time.time()
                res = urllib.urlopen(self.test_url, proxies=proxy_temp).read()
                res2 = urllib.urlopen(self.another_url, proxies=proxy_temp).read()
                # Elapsed time covers both downloads.
                t_use = time.time() - t_start
                ans = BeautifulSoup(res).find('link', rel='dns-prefetch')
                ans2 = BeautifulSoup(res2).find('link', rel='dns-prefetch')
                if ans is not None and ans2 is not None:
                    aim_ip.append((ip[0], ip[1], t_use))
            except Exception as e:
                # Best effort: a failing proxy must not stop the others.
                print(e)

    def run(self):
        """Thread entry point: validate the assigned proxies."""
        self.Check_ip()
class save_csv():
    """Persist validated proxies into the ``ip`` table of the local database.

    Despite the name, nothing is written to CSV; rows go to SQL Server
    through pyodbc.
    """

    def __init__(self, SaveList):
        """Store connection settings and the proxies to save.

        Args:
            SaveList: sequence of ``(ip, port, elapsed_seconds)`` tuples.
        """
        self.driver = '{SQL Server}'
        self.server = '(local)'
        self.database = 'ip_save'
        self.savelist = SaveList

    def Save_ip(self):
        """Insert every proxy that is not yet stored.

        Returns:
            The number of proxies that were already present and therefore
            skipped.
        """
        base = pyodbc.connect(DRIVER = self.driver, SERVER = self.server, DATABASE = self.database)
        try:
            source = base.cursor()
            counts = 0
            for each in self.savelist:
                # Values come from a scraped web page, so they are bound as
                # parameters instead of being formatted into the SQL text.
                source.execute("select * from ip where ips=?", each[0])
                if source.fetchone() is None:
                    # str() keeps the elapsed time in the same textual form
                    # the previous '%s' formatting sent to the server.
                    source.execute("Insert into ip values(?, ?, ?)",
                                   each[0], each[1], str(each[2]))
                else:
                    print("The ip: '%s' is exist!" % each[0])
                    counts += 1
            base.commit()
            source.close()
        finally:
            # Release the connection even when a statement fails.
            base.close()
        return counts
if __name__ == '__main__':
    # Decoration printed around every summary line.
    left = '@' * 3 + ' ' * 2
    right = ' ' * 2 + '@' * 3

    # Stage 1: one scraper thread per listing page.
    GetThreading = [ipGet(page) for page in target_url]
    for scraper in GetThreading:
        scraper.start()
        print(scraper.is_alive())
    for scraper in GetThreading:
        scraper.join()
    print(left + "总共抓取了%s个代理" % len(all_message) + right)

    # Stage 2: split the scraped proxies into 20 slices (ceiling division),
    # one checker thread per slice.
    per_thread = (len(all_message) + 19) // 20
    CheckThreading = []
    for k in range(20):
        piece = all_message[per_thread * k:per_thread * (k + 1)]
        CheckThreading.append(ipCheck(piece))
    for checker in CheckThreading:
        checker.start()
        print(checker.is_alive())
    for checker in CheckThreading:
        checker.join()
    print(left + "总共有%s个代理通过校验" % len(aim_ip) + right)

    # Stage 3: store the survivors; Save_ip reports how many already existed.
    saver = save_csv(aim_ip)
    counts = saver.Save_ip()
    print(left + "总共新增%s个代理" % (len(aim_ip) - counts) + right)
# -*- coding: utf-8 -*-
import pyodbc
import threading
import socket
import urllib
import time
from bs4 import BeautifulSoup
class Get_ip_sql():
    """Load all stored proxies from the ``ip`` table."""

    def __init__(self):
        """Store the connection settings for the local database."""
        self.driver = '{SQL Server}'
        self.server = '(local)'
        self.database = 'ip_save'

    def Get(self):
        """Fetch every row of the ``ip`` table.

        Returns:
            A ``(rows, count)`` tuple: the list of fetched rows and how
            many there are.
        """
        base = pyodbc.connect(DRIVER=self.driver, SERVER=self.server, DATABASE=self.database)
        try:
            source = base.cursor()
            CheckList = list(source.execute("Select * from ip"))
            source.close()
        finally:
            # The connection used to be left open; always release it.
            base.close()
        # The count is taken from the fetched rows rather than a second
        # query, so it can never disagree with the list the caller slices.
        return CheckList, len(CheckList)
class Check_ip_intime(threading.Thread):
    """Worker thread that re-validates stored proxies.

    Proxies that fail the check (or raise while being probed) are deleted
    from the ``ip`` table; for the others ``time_used`` is refreshed.
    """

    def __init__(self, CheckList):
        """Store the rows to check and the connection settings.

        Args:
            CheckList: rows of the ``ip`` table; index 0 is used as the
                IP and index 1 as the port.
        """
        threading.Thread.__init__(self)
        self.checklist = CheckList
        self.driver = '{SQL Server}'
        self.server = '(local)'
        self.database = 'ip_save'
        self.test_url = 'http://www.jd.com/?cu=true&utm_source=click.linktech.cn&utm_medium=tuiguang&utm_campaign=t_4_A100220955&utm_term=7e7c13a102664ab3a6886ccefa66d930&abt=3'
        self.another_url = 'https://www.taobao.com/'

    def _probe(self, each):
        """Fetch both test pages with the proxy configured.

        Returns:
            The elapsed seconds as a string when both responses contain a
            ``<link rel="dns-prefetch">`` element, otherwise ``None``.
        """
        # NOTE(review): only an 'http' entry is supplied while another_url
        # is https -- verify the second request really uses the proxy.
        proxy_temp = {'http': "http://%s:%s" % (each[0], each[1])}
        t_start = time.time()
        res = urllib.urlopen(self.test_url, proxies=proxy_temp).read()
        res2 = urllib.urlopen(self.another_url, proxies=proxy_temp).read()
        t_use = time.time() - t_start
        ans = BeautifulSoup(res).find('link', rel='dns-prefetch')
        ans2 = BeautifulSoup(res2).find('link', rel='dns-prefetch')
        if ans is None or ans2 is None:
            return None
        return str(t_use)

    def Work(self):
        """Check every assigned proxy and update or delete its row."""
        base = pyodbc.connect(DRIVER=self.driver, SERVER=self.server, DATABASE=self.database)
        try:
            source = base.cursor()
            # NOTE: this is the default for all sockets in the process.
            socket.setdefaulttimeout(3)
            for each in self.checklist:
                try:
                    t_use = self._probe(each)
                except Exception as e:
                    # A proxy that raises is treated as dead.
                    print(e)
                    t_use = None
                # Values are bound as parameters rather than formatted
                # into the SQL text.
                if t_use is None:
                    source.execute("Delete from ip where ips = ?", each[0])
                else:
                    source.execute("Update ip set time_used = ? where ips = ?",
                                   t_use, each[0])
                    # NOTE(review): the original indentation was lost; the
                    # IP is printed for proxies that are still alive.
                    print(each[0])
                # Commit per proxy so a later failure does not discard the
                # work already done and no transaction stays open across
                # the slow network probes.
                base.commit()
            source.close()
        finally:
            # The connection used to be left open; always release it.
            base.close()

    def run(self):
        """Thread entry point: run the check."""
        self.Work()
class Count_ip():
    """Count the rows currently stored in the ``ip`` table."""

    def __init__(self):
        """Store the connection settings for the local database."""
        self.driver = '{SQL Server}'
        self.server = '(local)'
        self.database = 'ip_save'

    def Compute(self):
        """Return the number of rows in the ``ip`` table."""
        base = pyodbc.connect(DRIVER=self.driver, SERVER=self.server, DATABASE=self.database)
        try:
            source = base.cursor()
            ans = source.execute("Select count(*) from ip").fetchone()
            source.close()
        finally:
            # The connection used to be left open; always release it.
            base.close()
        return ans[0]
if __name__ == '__main__':
    # Load the stored proxies together with their count.
    loader = Get_ip_sql()
    Check, counts = loader.Get()
    points = 0  # unused; retained from the original script

    # Split the rows into 5 slices (ceiling division), one thread each.
    per_thread = (counts + 4) // 5
    CheckThreading = [
        Check_ip_intime(Check[per_thread * k:per_thread * (k + 1)])
        for k in range(5)
    ]
    for worker in CheckThreading:
        worker.start()
        print(worker.is_alive())
    for worker in CheckThreading:
        worker.join()

    # Whatever is left in the table survived the check.
    ans = Count_ip().Compute()
    left = '@' * 3 + ' ' * 2
    right = ' ' * 2 + '@' * 3
    print(left + "总共删除了%s个失效代理" % (counts - ans) + right)
    print(left + "剩余%s个代理" % ans + right)