util.py file
import requests

def get(url, params=None, cookie=None, headers=None, proxies=None):
    '''
    Send a GET request.
    :param url: target URL
    :param params: query-string parameters
    :param cookie: cookies to attach to the session
    :param headers: request headers
    :param proxies: proxy settings
    :return: (1, response body) on success, (0,) on failure
    '''
    s = requests.Session()
    try:
        if params is not None:
            s.params = params
        if cookie is not None:
            s.cookies = cookie
        if headers is not None:
            s.headers = headers
        if proxies is not None:
            s.proxies = proxies
        r = s.get(url=url, timeout=20)
        return (1, r.content)
    except Exception as e:
        print(e)
    finally:
        if s:
            s.close()
    return (0,)
def post(url, data, params=None, cookie=None, headers=None, proxies=None):
    '''
    Send a POST request.
    :param url: target URL
    :param data: form data for the request body
    :param params: query-string parameters
    :param cookie: cookies to attach to the session
    :param headers: request headers
    :param proxies: proxy settings
    :return: (1, response body, response cookies) on success, (0,) on failure
    '''
    s = requests.Session()
    try:
        if params is not None:
            s.params = params
        if cookie is not None:
            s.cookies = cookie
        if headers is not None:
            s.headers = headers
        if proxies is not None:
            s.proxies = proxies
        r = s.post(url=url, data=data, timeout=20)
        return (1, r.content, r.cookies)
    except Exception as e:
        print(e)
    finally:
        if s:
            s.close()
    return (0,)
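post usage example (a minimal sketch; the crawl example below only exercises get, and the login URL and form fields here are hypothetical, for illustration only):
from xxx import util
# Hypothetical endpoint and form fields, not taken from any real site.
r = util.post("https://example.com/login", data={"user": "demo", "pwd": "demo"})
if r[0] == 1:
    body = r[1].decode()   # response body
    cookies = r[2]         # cookies returned by the server, e.g. a session id
else:
    print("request failed")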
Crawl example (guazi):
from xxx import util
import re
from lxml import etree
from bs4 import BeautifulSoup
head = {"Cookie": "antipas=2W192MJ893976a23019W485050817",
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
"Accept-Language": "zh-CN,zh;q=0.9",
"Cache-Control": "max-age=0",
"Connection": "keep-alive",
"Host": "www.guazi.com",
"Upgrade-Insecure-Requests": "1",
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36"}
r = util.get("https://www.guazi.com/sjz/dazhong/", headers=head)
if r[0] == 1:
    body = r[1].decode()
    # BeautifulSoup approach
    soup = BeautifulSoup(body, 'lxml')
    all = soup.find_all('a', class_="car-a")
    for item in all:
        href = item.attrs["href"]
        title = item.attrs["title"]
        gl = item.contents[7].text.split('|')[1]      # mileage field from the info line
        price = item.contents[9].contents[3].text     # listed price
    # XPath approach
    html = etree.HTML(body)
    # data = html.xpath('//ul[@class="carlist clearfix js-top"]/li')
    # for item in data:
    #     href = item.xpath('a/@href')[0]
    #     title = item.xpath('a/@title')[0]
    #     gl = item.xpath('a/div[1]/text()')[1]
    #     price = item.xpath('a/div[2]/p/text()')[0]
    # Regex approach
    body = r[1].decode().replace("\n", "").replace("\r", "").replace("\t", "")
    # com = re.compile('(.*?).*?(.*?)(.*?)')  # the HTML tags inside the original pattern were lost in formatting
    # data = com.findall(body)
    # for item in data:
    #     print(item[0])
    #     print(item[1])
    #     print(item[2])
    #     print(item[3])
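Since the original pattern cannot be recovered, here is a minimal regex sketch. It assumes the listing markup matches what the BeautifulSoup/XPath code above relies on (one <a> tag per car carrying href and title, with href appearing before title); the attribute order and tag layout are assumptions, not confirmed against the live page:
    # Assumed markup: <a ... href="..." ... title="...">; adjust the pattern to the real page source.
    com = re.compile(r'<a[^>]*?href="([^"]*)"[^>]*?title="([^"]*)"')
    for href, title in com.findall(body):
        print(href, title)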