Scraping Zhaopin (智联招聘) job listings with BS4

The code uses try/except to handle exceptions during the requests.
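As a minimal sketch of that pattern (fetch() is an illustrative helper, not part of the script below), catching the specific requests exception instead of relying on a bare except looks like this:

import requests

def fetch(url):
    try:
        resp = requests.get(url, timeout=20)
        resp.raise_for_status()   # turn 4xx/5xx responses into exceptions too
        return resp.text
    except requests.exceptions.RequestException as e:
        # covers timeouts, connection errors and bad status codes
        print('request failed: %s' % e)
        return None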

A proxy IP is chosen at random and the crawler sleeps 15 seconds between pages, mimicking a human clicking through results to avoid the site's anti-scraping checks.
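A minimal sketch of the proxy-rotation and throttling idea; pick_proxy and the proxy addresses below are placeholders for illustration, not working proxies:

import random
import time

proxy_pool = ['1.2.3.4:8118', '5.6.7.8:9999']   # placeholder addresses

def pick_proxy():
    # random.choice can never index past the end of the list,
    # unlike random.randint(0, 4) on a 4-element list
    return {'http': random.choice(proxy_pool)}

# between two page requests:
time.sleep(15)   # fixed 15-second pause; random.uniform(10, 20) would also work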

# coding=utf-8
from bs4 import BeautifulSoup
import requests
import time
import random
import MySQLdb

def getpage():
    pg = 1

    h1 = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'}
    o_g = ['222.33.192.238:8118', '121.61.17.36:8118', '113.200.214.164:9999', '222.33.192.238:8118']

    while pg < 10:
        # pick a fresh random proxy for every page request
        pro = {'http': random.choice(o_g)}
        # rebuild the URL inside the loop so the page number actually changes
        url = 'http://sou.zhaopin.com/jobs/searchresult.ashx?in=180000&pd=30&jl=%E4%B8%8A%E6%B5%B7&kw=%E5%9F%BA%E9%87%91&sm=0&sf=0&el=4&isfilter=0&fl=538&isadv=1&sb=1&p=' + str(pg)
        try:
            html = requests.get(url, timeout=20, headers=h1, proxies=pro)
            html.encoding = 'utf-8'
            html = html.text
            print 'page %d fetched' % pg
            parse(html)
        except requests.exceptions.RequestException:
            # network error or timeout: ask whether to retry the same page
            xx = raw_input('retry this page? (yes/no) ')
            if xx == 'yes':
                continue
            print 'skipping page %d' % pg
        print 'waiting 15 seconds before the next page'
        time.sleep(15)
        pg = pg + 1



def parse(html):    # renamed from re() so it does not shadow the standard-library re module
    try:
        l1 = []     # job titles
        l2 = []     # job detail links
        l3 = []     # salary / location detail text
        soup = BeautifulSoup(html, 'lxml')
        con = soup.find_all('a', style="font-weight: bold")
        for item in con:
            l1.append(item.get_text())
            l2.append(item.attrs['href'])
        con2 = soup.find_all('li', class_="newlist_deatil_two")
        for item2 in con2:
            l3.append(item2.get_text(" ", strip=True))
        print l1
        print l2
        print l3

        # open one connection per page instead of one per row
        conn = MySQLdb.connect(
            host='localhost',
            port=3306,
            user='root',
            passwd='******',
            db='zlzp',
            charset='utf8'
        )
        cur = conn.cursor()
        # insert however many rows were actually found, not a fixed 59
        for j in range(min(len(l1), len(l2), len(l3))):
            # parameterized query: the driver escapes the values
            cur.execute("INSERT INTO zlzp VALUES (NULL, %s, %s, %s)",
                        (l1[j], l2[j], l3[j]))
        conn.commit()
        cur.close()
        conn.close()
        print("page saved")
    except Exception as e:
        # report the failure instead of recursing on the same html forever
        print("parse or insert failed: %s" % e)


getpage()
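If you would rather batch all rows from a page into a single call, MySQLdb's executemany accepts the same %s placeholders; a minimal sketch with a hypothetical save_rows helper (connection settings copied from the script above, password still masked):

import MySQLdb

def save_rows(rows):
    # rows: list of (title, link, detail) tuples scraped from one page
    conn = MySQLdb.connect(host='localhost', port=3306, user='root',
                           passwd='******', db='zlzp', charset='utf8')
    cur = conn.cursor()
    # the driver escapes each value, so quotes in the scraped text
    # cannot break or inject into the SQL statement
    cur.executemany("INSERT INTO zlzp VALUES (NULL, %s, %s, %s)", rows)
    conn.commit()
    cur.close()
    conn.close()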
