requests: page-level crawler idioms and examples

The util.py file

import requests

def get(url, params=None, cookie=None, headers=None, proxies=None):
    '''
    Send a GET request.
    :param url: target URL
    :param params: query-string parameters
    :param cookie: cookies to merge into the session
    :param headers: request headers
    :param proxies: proxy settings
    :return: (1, response body as bytes) on success, (0,) on failure
    '''
    s = requests.Session()
    try:
        if params is not None:
            s.params = params
        if cookie is not None:
            s.cookies.update(cookie)   # update() accepts a dict or a cookie jar
        if headers is not None:
            s.headers.update(headers)  # merge with the session's default headers
        if proxies is not None:
            s.proxies = proxies
        r = s.get(url=url, timeout=20)
        return (1, r.content)
    except Exception as e:
        print(e)
    finally:
        s.close()
    return (0,)


def post(url, data, params=None, cookie=None, headers=None, proxies=None):
    '''
    Send a POST request.
    :param url: target URL
    :param data: form data for the request body
    :param params: query-string parameters
    :param cookie: cookies to merge into the session
    :param headers: request headers
    :param proxies: proxy settings
    :return: (1, response body as bytes, response cookies) on success, (0,) on failure
    '''
    s = requests.Session()
    try:
        if params is not None:
            s.params = params
        if cookie is not None:
            s.cookies.update(cookie)   # update() accepts a dict or a cookie jar
        if headers is not None:
            s.headers.update(headers)  # merge with the session's default headers
        if proxies is not None:
            s.proxies = proxies
        r = s.post(url=url, data=data, timeout=20)
        return (1, r.content, r.cookies)
    except Exception as e:
        print(e)
    finally:
        s.close()
    return (0,)
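
A quick usage sketch of the two helpers (the URLs and form fields below are placeholders, not a real site). The point is the tuple convention: index 0 is the success flag, and post() additionally returns the response cookies so a login session can be carried into later requests.

# GET: r[0] is the status flag, r[1] the raw body on success
r = util.get("https://example.com/list", params={"page": 1})
if r[0] == 1:
    print(len(r[1]))

# POST: on success the third element holds the response cookies,
# which can be passed back as the cookie argument of a later get()
r = util.post("https://example.com/login", data={"user": "u", "pwd": "p"})
if r[0] == 1:
    body, cookies = r[1], r[2]
    r2 = util.get("https://example.com/profile", cookie=cookies)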

Crawling example (guazi.com):

from xxx import util  # xxx: whatever package holds the util.py above
import re
from lxml import etree
from bs4 import BeautifulSoup

head = {"Cookie": "antipas=2W192MJ893976a23019W485050817",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
        "Accept-Language": "zh-CN,zh;q=0.9",
        "Cache-Control": "max-age=0",
        "Connection": "keep-alive",
        "Host": "www.guazi.com",
        "Upgrade-Insecure-Requests": "1",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36"}

r = util.get("https://www.guazi.com/sjz/dazhong/", headers=head)
if r[0] == 1:
    body = r[1].decode()  # r.content is bytes; decode() assumes UTF-8

BeautifulSoup (bs4) approach

soup = BeautifulSoup(body, 'lxml')
cars = soup.find_all('a', class_="car-a")            # one <a class="car-a"> per listing
for item in cars:
    href = item.attrs["href"]
    title = item.attrs["title"]
    mileage = item.contents[7].text.split('|')[1]    # 8th child node holds the "age|mileage" text
    price = item.contents[9].contents[3].text        # 10th child node's 4th child holds the price
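
The contents[7]/contents[9] indexing counts whitespace text nodes between tags and breaks as soon as the markup shifts. A sketch of a less position-sensitive variant, assuming each a.car-a holds two direct child divs with the price inside a p in the second (the structure the XPath version below implies):

soup = BeautifulSoup(body, 'lxml')
for item in soup.find_all('a', class_="car-a"):
    href = item.get("href")
    title = item.get("title")
    divs = item.find_all('div', recursive=False)   # direct child divs only
    mileage = divs[0].get_text().split('|')[1]     # "age|mileage" field
    price = divs[1].find('p').get_text()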

XPath approach

html = etree.HTML(body)
data = html.xpath('//ul[@class="carlist clearfix js-top"]/li')
for item in data:
    href = item.xpath('a/@href')[0]
    title = item.xpath('a/@title')[0]
    mileage = item.xpath('a/div[1]/text()')[1]   # second text node of the first div: the mileage field
    price = item.xpath('a/div[2]/p/text()')[0]

Regex approach. The page is first flattened to a single line so the non-greedy groups do not have to span newlines.

body = r[1].decode().replace("\n", "").replace("\r", "").replace("\t", "")
# Illustrative pattern only, assuming the markup the XPath version implies
# (li > a[title][href] > div ... div > p); check it against the real page source.
com = re.compile(r'<li[^>]*?>.*?<a[^>]*?title="(.*?)"[^>]*?href="(.*?)".*?'
                 r'<div[^>]*?>(.*?)</div>.*?<p[^>]*?>(.*?)</p>')
data = com.findall(body)
for item in data:
    print(item[0])  # title
    print(item[1])  # href
    print(item[2])  # text of the first div ("age|mileage")
    print(item[3])  # price
