爬虫网上常见案例代码合集

第一节:爬取百度首页

import requests

# Fetch the Baidu home page and print its HTML.
# The timeout keeps the script from blocking forever on an unresponsive
# server, and raise_for_status() turns an HTTP error response into an
# exception instead of printing the error page as if it were the real one.
response = requests.get('https://www.baidu.com/', timeout=10)
response.raise_for_status()
response.encoding = 'utf-8'  # decode the body as UTF-8 instead of the guessed charset
print(response.text)

第二节:Requests+XPath 爬取豆瓣电影

1、爬取单个元素信息

import requests
from lxml import etree

# Download one Douban movie page and print the text of a single element
# selected with XPath.
url = 'https://movie.douban.com/subject/1292052/'  # link for "The Shawshank Redemption"

# Bound the wait and fail loudly on an HTTP error; otherwise an error page
# would be parsed and the XPath below would just return an empty list.
response = requests.get(url, timeout=10)
response.raise_for_status()
tree = etree.HTML(response.text)

# xpath() returns a list of matching text nodes, not a single string.
title = tree.xpath('//*[@id="content"]/h1/span[1]/text()')
print(title)

2、爬取多个元素信息

import requests
from lxml import etree

# Download one Douban movie page and print several fields, each selected
# with its own XPath expression. Every xpath() call returns a list.
url = 'https://movie.douban.com/subject/1292052/'

# Bound the wait and fail loudly on an HTTP error; otherwise an error page
# would be parsed and every field below would silently come back empty.
response = requests.get(url, timeout=10)
response.raise_for_status()
tree = etree.HTML(response.text)

movie = tree.xpath('//*[@id="content"]/h1/span[1]/text()')
director = tree.xpath('//*[@id="info"]/span[1]/span[2]/a/text()')
actor = tree.xpath('//*[@id="info"]/span[3]/span[2]/a/text()')
# Named `runtime` rather than `time` so it does not shadow the stdlib
# `time` module name.
runtime = tree.xpath('//*[@id="info"]/span[13]/text()')

print('电影名称:', movie)
print('导演:', director)
print('主演:', actor)
print('片长:', runtime)

第三节:爬取豆瓣图书,并将数据存到本地

import requests
from lxml import etree
import time

# Crawl the Douban book Top 250 listing (start offsets 0, 25, ..., 225) and
# write one ', '-separated line per book to a local file.
# NOTE(review): the target is named .xlsx but what is written is plain
# UTF-8 text, not an Excel workbook -- consider a .csv/.txt name. The path
# and line format are left unchanged here.
with open('/Users/mubai888/Desktop/top250.xlsx', 'w', encoding='utf-8') as f:
    for page in range(10):
        url = 'https://book.douban.com/top250?start={}'.format(page * 25)

        # Bound the wait and fail loudly on an HTTP error; otherwise an
        # error page would be parsed, match no tables, and the page would
        # be skipped without any indication.
        response = requests.get(url, timeout=10)
        response.raise_for_status()

        tree = etree.HTML(response.text)
        tables = tree.xpath('//*[@id="content"]/div/div[1]/div/table')
        time.sleep(3)  # pause between page requests

        for table in tables:
            title = table.xpath('./tr/td[2]/div[1]/a/@title')[0]
            href = table.xpath('./tr/td[2]/div[1]/a/@href')[0]
            score = table.xpath('./tr/td[2]/div[2]/span[2]/text()')[0]
            # Peel the surrounding parentheses and whitespace off the
            # rating-count text.
            num = table.xpath('./tr/td[2]/div[2]/span[3]/text()')[0].strip('(').strip().strip(')').strip()
            blurb = table.xpath('./tr/td[2]/p[2]/span/text()')

            # The blurb is optional: append it as a fifth column only when
            # the entry has one.
            columns = [title, href, score, num]
            if blurb:
                columns.append(blurb[0])
            f.write(', '.join(columns) + '\n')

第四节:爬取小猪短租房屋信息,并将数据保存到本地

from lxml import etree
import requests
import time

# Crawl search-result pages 1-5 of the Xiaozhu short-term rental site and
# write one space-separated line per listing to a local file.
# NOTE(review): the target is named .xls but what is written is plain
# UTF-8 text, not an Excel workbook -- consider a .txt/.csv name. The path
# and line format are left unchanged here.
with open('/Users/mubai888/Desktop/xiaozhu.xls', 'w', encoding='utf-8') as f:
    for page in range(1, 6):
        url = 'http://cd.xiaozhu.com/search-duanzufang-p{}-0/'.format(page)

        # Bound the wait and fail loudly on an HTTP error; otherwise an
        # error page would be parsed, match no listings, and the page
        # would be skipped without any indication.
        response = requests.get(url, timeout=10)
        response.raise_for_status()

        tree = etree.HTML(response.text)
        listings = tree.xpath('//*[@id="page_list"]/ul/li')
        time.sleep(3)  # pause between page requests

        for item in listings:
            title = item.xpath('./div[2]/div/a/span/text()')[0]
            price = item.xpath('./div[2]/span[1]/i/text()')[0]
            description = item.xpath('./div[2]/div/em/text()')[0].strip()
            # The image URL is read from the `lazy_src` attribute, not `src`.
            pic = item.xpath('./a/img/@lazy_src')[0]

            f.write('{} {} {} {}\n'.format(title, price, description, pic))

你可能感兴趣的:(实战案例)