The code uses try/except to catch exceptions.
A proxy IP is picked at random and the script sleeps 15 seconds between pages, simulating human clicks to get around the site's anti-scraping measures.
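As a minimal, standalone sketch of that pattern (the proxy addresses below are hypothetical placeholders, not the ones the script uses), the idea looks like this:

import random
import time
import requests

PROXIES = ['1.2.3.4:8118', '5.6.7.8:9999']  # hypothetical proxy pool

def fetch(url):
    pro = {'http': random.choice(PROXIES)}  # a different IP for each request
    resp = requests.get(url, timeout=20, proxies=pro)
    time.sleep(15)  # fixed pause so requests arrive at a human-like pace
    return resp.text

The full script below applies the same idea while paging through the Zhaopin search results.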
# coding=utf-8
from bs4 import BeautifulSoup
import requests
import time
import random
import MySQLdb
def getpage():
    pg = 1
    h1 = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'}
    # Proxy pool; random.choice cannot go out of range, unlike the old randint(0, 4)
    o_g = ['222.33.192.238:8118', '121.61.17.36:8118', '113.200.214.164:9999', '222.33.192.238:8118']
    while pg < 10:
        # Pick a fresh proxy and rebuild the URL on every pass so the page number actually advances
        pro = {'http': random.choice(o_g)}
        url = 'http://sou.zhaopin.com/jobs/searchresult.ashx?in=180000&pd=30&jl=%E4%B8%8A%E6%B5%B7&kw=%E5%9F%BA%E9%87%91&sm=0&sf=0&el=4&isfilter=0&fl=538&isadv=1&sb=1&p=' + str(pg)
        try:
            html = requests.get(url, timeout=20, headers=h1, proxies=pro)
            html.encoding = 'utf-8'
            html = html.text
            print 'page fetched'
            parse(html)
        except Exception as e:  # pin down which exception this actually is
            print e
            xx = raw_input('again?')
            if xx == 'yes':
                continue  # retry this page with a fresh proxy
            else:
                print 'ERROR INPUT !'
        print 'waiting 15 seconds before turning the page'
        time.sleep(15)
        pg = pg + 1
def parse(html):  # renamed from re() so the function no longer shadows the re module
    try:
        l1 = []  # job titles
        l2 = []  # job links
        l3 = []  # job detail text
        soup = BeautifulSoup(html, 'lxml')
        con = soup.find_all('a', style="font-weight: bold")
        for item in con:
            l1.append(item.get_text())
            l2.append(item.attrs['href'])
        con2 = soup.find_all('li', class_="newlist_deatil_two")
        for item2 in con2:
            l3.append(item2.get_text())
        print l1
        print l2
        print l3
        # One connection per page instead of one per row
        conn = MySQLdb.connect(
            host='localhost',
            port=3306,
            user='root',
            passwd='******',
            db='zlzp',
            charset='utf8'
        )
        cur = conn.cursor()
        # Parameterised INSERT avoids quoting/injection problems; loop over what was
        # actually parsed instead of a hard-coded range(0, 59)
        for j in range(min(len(l1), len(l3))):
            cur.execute("INSERT INTO zlzp VALUES (NULL, %s, %s, %s)", (l1[j], l2[j], l3[j]))
        cur.close()
        conn.commit()
        conn.close()
        print 'done'
    except Exception as e:
        print e
        print 'parsing this page again'
        parse(html)
getpage()
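The INSERT above assumes a zlzp table whose first column auto-increments, followed by three text columns; the script never shows the schema. A minimal sketch of a compatible table, with hypothetical column names (title, link, detail), could be created like this:

import MySQLdb

conn = MySQLdb.connect(host='localhost', port=3306, user='root',
                       passwd='******', db='zlzp', charset='utf8')
cur = conn.cursor()
# Column names here are assumptions; only the column count and the auto-increment
# id are implied by the INSERT in the script above.
cur.execute("""
    CREATE TABLE IF NOT EXISTS zlzp (
        id INT AUTO_INCREMENT PRIMARY KEY,
        title VARCHAR(255),
        link VARCHAR(255),
        detail TEXT
    ) DEFAULT CHARSET=utf8
""")
conn.commit()
cur.close()
conn.close()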