爬虫简单语法

第一种:正则表达式(注:下面的示例只演示了 requests 发送请求的部分,并未包含正则解析代码)

import requests

# Query-string parameters; requests URL-encodes and appends them to the URL.
params = {"wd": "杨幂"}

url = "https://www.baidu.com/s?"

headers = {
    'User-Agent': 'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0'
}

response = requests.get(url=url, headers=headers, params=params)

# Headers that were actually sent with the request
print(response.request.headers)

# Encoding requests detected for the response body
print(response.encoding)
response = response.content  # rebinds the name to the raw body bytes

第二种:XPath

1.先安装:
pip3 install lxml
2.导入包:

import requests
from lxml import etree
import json


class QiushiSpider():
    """Crawl joke posts from qiushibaike.com via XPath and append them to a JSON-lines file."""

    def __init__(self):
        self.pg = 1  # page number to fetch
        self.url = "https://www.qiushibaike.com/text/"
        self.headers = {
            "User-Agent": 'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0'
        }

    def locapage(self, url):
        """Fetch one listing page, extract every post, and save each as one record.

        Raises IndexError if the site's markup no longer matches the XPath
        expressions below (each [0] assumes the element exists).
        """
        response = requests.get(url=url, headers=self.headers)
        html = response.content.decode('utf-8')
        content = etree.HTML(html)
        # Each direct child <div> of the "col1" container is one post.
        parent = content.xpath('//div[@class="col1"]/div')

        for son in parent:
            name = son.xpath(".//h2/text()")[0]
            # A post body may be split across several <span>s; join the pieces.
            contents = '.'.join(son.xpath('.//div[@class="content"]//span/text()'))
            smile = son.xpath('.//div[@class="stats"]/span/i/text()')[0]
            comments = son.xpath('.//div[@class="stats"]/span/a/i/text()')[0]

            # Renamed from `dict` -- never shadow the builtin.
            item = {
                'name': name.strip(),
                'content': contents.strip(),
                'smile': smile,
                'comments': comments,
            }
            self.save(item)

    def save(self, item):
        """Append one record to qiushi.json as a UTF-8 JSON line."""
        # Explicit encoding: ensure_ascii=False emits raw Chinese characters,
        # which would break on platforms whose default encoding is not UTF-8.
        with open("qiushi.json", "a", encoding="utf-8") as f:
            f.write(json.dumps(item, ensure_ascii=False) + "\n")

    def start(self):
        """Build the URL for the current page number and crawl it."""
        full_url = self.url + str(self.pg) + "/"
        self.locapage(full_url)


if __name__ == "__main__":
    # Entry point: crawl starting from page 1 when run as a script.
    QiushiSpider().start()

第三种:bs4(BeautifulSoup),使用选择器获取节点对象

import requests
from bs4 import BeautifulSoup
import json

def locapage():
    """Fetch Tencent's job-listing page and save every posting to job.json.

    Raises IndexError if the page markup no longer matches the selectors
    below (each cell index assumes the column exists).
    """
    url = 'https://hr.tencent.com/position.php?&start=0'

    headers = {
        "User-Agent": 'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0'
    }
    response = requests.get(url=url, headers=headers)

    html = response.content.decode('utf-8')

    bs = BeautifulSoup(html, 'lxml')

    # Listing rows alternate between class="even" and class="odd";
    # collect both sets (order is evens first, then odds, as before).
    rows = bs.select('tr[class="even"]') + bs.select('.odd')

    for job in rows:
        # Run the <td> selector once per row instead of once per field.
        cells = job.select('td')

        # Renamed from `dict`/`type` -- never shadow builtins.
        item = {
            "name": job.select('td a')[0].get_text(),
            "type": cells[1].get_text(),
            "num": cells[2].get_text(),
            "address": cells[3].get_text(),
            "time": cells[4].get_text(),
        }
        save(item)

def save(content):
    """Append one record to job.json as a UTF-8 JSON line."""
    # Explicit encoding: ensure_ascii=False emits raw non-ASCII characters,
    # which would break on platforms whose default encoding is not UTF-8.
    with open('job.json', 'a', encoding="utf-8") as f:
        f.write(json.dumps(content, ensure_ascii=False) + "\n")


if __name__ == '__main__':
    # Entry point: crawl the listing page when run as a script.
    locapage()

你可能感兴趣的:(爬虫简单语法)