爬虫代码集

一、简单爬取猫眼(使用字符串)

import json

import requests
import re


# 返回页面
def get_page(url):
    """Fetch *url* and return the UTF-8 decoded body, or None on non-200."""
    ua = "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; 360SE)"
    response = requests.get(url, headers={"User-Agent": ua})
    if response.status_code != 200:
        return None
    return response.content.decode('utf-8')


# 选择抓取内容
# NOTE(review): this function's regex string literals were destroyed when the
# HTML tags were stripped out of the page this code was copied from — the
# re.compile('...') patterns below are truncated and several statements are
# fused onto single lines, so this block is NOT valid Python as-is.
# Per the surviving comments, the intent was to re.findall the lead actors
# (主演), movie names (电影名), release times, ranks (排名) and image links
# (图片链接) out of the Maoyan listing HTML. Recover the original patterns
# before attempting to run this.
def parse_page(html):
    # Lead actors
    pattern = re.compile('

(.*?)

', re.S) actor_items = re.findall(pattern, html) # 电影名 pattern = re.compile('movieId.*?>.*?(.*?)

',re.S) time_items = re.findall(pattern,html) # 排名 pattern = re.compile('(.*?)', re.S) rank_items = re.findall(pattern, html) # 图片链接 # pattern = re.compile('movieId.*?>.*?

二、简单爬取豆瓣(使用XPATH)

import requests
from lxml import etree


# 取页面HTML


def get_one_page():
    """Download the Douban group-explore "culture" page; return its HTML or None."""
    headers = {
        "User-Agent": "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; 360SE)"
    }
    resp = requests.get("https://www.douban.com/group/explore/culture",
                        headers=headers)
    if resp.status_code != 200:
        return None
    return resp.content.decode('utf-8')


# Parse the page with lxml XPath.
# Tutorial-style function: most of the example queries are left commented
# out; only the //div[@class="article"]//text() query actually runs.
def parse_with_xpath(html):
    etree_html = etree.HTML(html)
    # print(etree_html)

    # Match every node: //*
    # result = etree_html.xpath('//*')
    # print(result)
    # print(len(result))

    # Match all <a> nodes; text() extracts the text content
    # result = etree_html.xpath('//a/text()')
    # print(result)

    # Direct child nodes: /
    # result = etree_html.xpath('//div/p/text()')
    # print(result)

    # All descendant nodes: //
    # result = etree_html.xpath('//div[@class="channel-item"]//h3/a/text()')
    # print(result)

    # Parent node: ..
    # result = etree_html.xpath('//span[@class="pubtime"]/../span/a/text()')
    # print(result)

    # Attribute match: [@class="xxx"]
    # Text match: text(); //text() collects all nested text
    result = etree_html.xpath('//div[@class="article"]//text()')
    print(result)

    # Attribute extraction: @href
    # result = etree_html.xpath('//div[@class="article"]/div/div/@class')[0]
    # # result = etree_html.xpath('//div[@class="bd"]/h3/a/@href')
    # print(result)

    # Multi-valued attribute match: contains(@class, 'xx')
    # result = etree_html.xpath('//div[contains(@class, "grid-16-8")]//div[@class="likes"]/text()[1]')
    # print(result)

    # Combining attribute tests: or, and, mod, //book | //cd, + - * div = != < > <= >=
    # result = etree_html.xpath('//span[@class="pubtime" and contains(text(), "10-18")]/text()')
    # print(result)

    # Positional selection: [1] [last()] [position() < 3] [last()-2]
    # Node axes:
    # //li/ancestor::*   all ancestor nodes
    # //li/ancestor::div the div ancestor node
    # //li/attribute::*  attribute axis: all attribute values of the li node
    # //li/child::a[@href="link1.html"]  child axis: direct children
    # //li/descendant::span  all span descendants
    # //li/following::*  every node after the closing tag of the current node
    # //li/following-sibling::*  all following siblings of the current node

    # result = etree_html.xpath('//li/ancestor::div')
    # print(result)


def main():
    """Script entry point: fetch the Douban page and run the XPath demo."""
    page_html = get_one_page()
    # print(page_html)
    parse_with_xpath(page_html)


if __name__ == '__main__':
    main()


三、Beautiful Soup 爬取

import requests
from bs4 import BeautifulSoup


# 取页面HTML
def get_one_page():
    """Fetch the zhipin.com job-listing page; return decoded HTML or None."""
    resp = requests.get(
        "https://www.zhipin.com/c101270100-p100109/",
        headers={
            "User-Agent": "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; 360SE)"
        },
    )
    if resp.status_code == 200:
        return resp.content.decode('utf-8')
    return None


def parse_soup(html):
    """Demonstrate basic BeautifulSoup node access on the given HTML string."""
    soup = BeautifulSoup(html, "lxml")  # build the soup with the lxml parser
    print(soup.prettify())      # indented, pretty-printed document
    print(soup.title.string)    # text of the <title> tag
    print(soup.head)
    print(soup.p)
    # Node name
    print(soup.title.name)
    # Node attributes
    # BUG FIX: the original fused two statements onto one line
    # ('soup.img.attrs["src"]print(soup.p.attrs)'), a syntax error —
    # split back into the two intended calls.
    print(soup.img.attrs["src"])
    print(soup.p.attrs)
    print(soup.p.attrs["name"])
    print(soup.p["class"])
    # Text contained in the node
    print(soup.p.string)
    


嵌套选择

    # soup 的节点都为 bs4.element.Tag 类型,可以继续选择
    print(soup.head.title.string)

关联选择

有些元素没有特征定位,可以先选择有办法定位的,然后以这个节点为准选择它的子节点、父节点、兄弟节点等。

    print(soup.p.contents)            # 取 p 节点下面所有子节点列表
    print(soup.p.descendants)         # 取 p 节点所有子孙节点
    print(soup.a.parent)              # 取父节点
    print(soup.a.parents)             # 取所有祖先节点
    print(soup.a.next_sibling)        # 同级下一节点
    print(soup.a.previous_sibling)    # 同级上一节点
    print(soup.a.next_siblings)       # 同级所有后面节点
    print(soup.a.previous_siblings)   # 同级所有前面节点
    print(list(soup.a.parents)[0].attrs['class'])

方法选择器:根据属性和文本进行查找

    print(soup.find_all(name="ul"))
    for ul in soup.find_all(name="ul"):
        print(ul.find_all(name="li"))
        for li in ul.find_all(name="li"):
            print(li.string)

    soup.find_all(attrs={"id": "list-1"})

CSS 选择器

    soup.select('.panel .panel_heading')
    soup.select('ul li')
    soup.select('#id1 .element')


    def main():
        html = get_one_page()
        # print(html)
        parse_soup(html)


    if __name__ == '__main__':
        main()

你可能感兴趣的:(爬虫代码集)