Practical Web Scraping Projects

Learning notes from hands-on scraper projects.

Jandan.net front-page titles

import requests
from lxml import etree

url = 'http://jandan.net/'

# Forge a browser User-Agent so the site does not reject the request
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.141 Safari/537.36',
}
resp = requests.get(url, headers=headers)
print(resp.status_code)
if resp.status_code == 200:
    html = resp.text
    # print(html)
    # Parse the HTML and pull out every post title in the index list
    dom = etree.HTML(html)
    xpath_pattern = '//div[@class="post f list-post"]/div[@class="indexs"]/h2/a/text()'
    titles = dom.xpath(xpath_pattern)
    print('titles', titles)

    for t in titles:
        print(t)
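
All of the scripts below share this same requests + lxml pattern, and the request step can fail in ways a bare status check does not surface. Here is a slightly more defensive variant, a sketch using only the standard requests API; fetch_html is a hypothetical helper, not part of the original scripts:

import requests

def fetch_html(url, headers=None, timeout=10):
    # timeout stops a dead server from hanging the script;
    # raise_for_status() turns 4xx/5xx answers into exceptions
    resp = requests.get(url, headers=headers, timeout=timeout)
    resp.raise_for_status()
    # guard against mis-detected charsets on Chinese pages
    resp.encoding = resp.apparent_encoding
    return resp.text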

NetEase News top-stories scraper

import requests
from lxml import etree

url = 'https://news.163.com/'

# News sites generally have weak anti-scraping measures, so forged headers are often unnecessary
resp = requests.get(url)
if resp.status_code == 200:
    html = resp.text
    print(html)
    # Parse the HTML
    dom = etree.HTML(html)
    xpath_pattern = '//ul[@class="top_news_ul"]/li/a/text()'
    titles = dom.xpath(xpath_pattern)
    print('titles', titles)

    for t in titles:
        print(t)

NetEase News hot-ranking scraper

import requests
from lxml import etree

url = 'https://news.163.com/'
# News sites generally have weak anti-scraping measures, so forged headers are often unnecessary
resp = requests.get(url)
if resp.status_code == 200:
    html = resp.text
    # Parse the HTML
    dom = etree.HTML(html)
    # This XPath matches in the browser dev tools but returns an empty list here.
    # The hot-rank block is most likely rendered client-side by JavaScript,
    # so it never appears in the raw HTML that requests receives.
    xpath_pattern = '//div[@class="mt35 mod_hot_rank clearfix"]/ul/li/a/text()'
    titles = dom.xpath(xpath_pattern)
    print('titles', titles)

    for t in titles:
        print(t)
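
A quick way to confirm that suspicion is to search the raw response for the class name the XPath depends on: if the marker string never occurs in resp.text, the block is injected by JavaScript after page load, and no XPath over the raw HTML can find it. A minimal check, reusing the html variable from above:

marker = 'mod_hot_rank'
if marker in html:
    print('block exists in the raw HTML; re-check the XPath instead')
else:
    print('block is rendered client-side; a plain GET will never see it')
    # options: drive a real browser (e.g. selenium), or locate the JSON/JS
    # endpoint the page calls in the Network tab and request that directly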

NetEase News hot-ranking scraper (following links to article bodies)

import requests
from lxml import etree

url = 'https://news.163.com/'
# News sites generally have weak anti-scraping measures, so forged headers are often unnecessary
resp = requests.get(url)
if resp.status_code == 200:
    html = resp.text
    # Parse the HTML
    dom = etree.HTML(html)
    # As above: this XPath works in the browser dev tools but can return an
    # empty list here if the block is rendered by JavaScript
    news_href_pattern = '//div[@class="mt35 mod_hot_rank clearfix"]/ul/li/a/@href'
    news_href_list = dom.xpath(news_href_pattern)
    print(news_href_list)

    for news_href in news_href_list:
        print(news_href)
        # Follow each headline link and pull the article body paragraphs
        resp2 = requests.get(news_href)
        html2 = resp2.text
        dom2 = etree.HTML(html2)
        xpath_pattern = '//div[@class="post_body"]/p/text()'
        content = dom2.xpath(xpath_pattern)
        print('content', content)
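
The article body comes back as a list of bare text fragments, one per <p> node; joining them gives readable text. When following many links in a loop it is also polite to pause between requests. A minimal sketch under those assumptions, meant to sit inside the loop body above after content is extracted:

import time

article_text = '\n'.join(p.strip() for p in content if p.strip())
print(article_text)
time.sleep(1)   # throttle to roughly one request per second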

ivsky.com image scraper (single album)

import os
import requests
from lxml import etree
home_url = 'https://www.ivsky.com/'
# Album page
album_url = 'https://www.ivsky.com/tupian/lugui_v62472/'

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.141 Safari/537.36',
}
# Request the album page
resp = requests.get(album_url, headers=headers)
status_code = resp.status_code
print(status_code)
album_html = resp.text
print(album_html)

# Grab the album title and the URL of every thumbnail in the album
album_dom = etree.HTML(album_html)
title_pattern = '//h1/text()'
img_pattern = '//ul[@class="pli"]/li/div/a/img/@src'
album_title = album_dom.xpath(title_pattern)[0]
# The album title may carry a trailing space, which would break the paths built later
album_title = album_title.strip()
img_src_list = album_dom.xpath(img_pattern)
print(album_title)
print(len(img_src_list), img_src_list)

# Create a folder named after the album, anchored to the script's directory
# so it matches the write path used below regardless of the working directory
album_dir = os.path.join(os.path.dirname(__file__), album_title)
if not os.path.exists(album_dir):
    os.mkdir(album_dir)

# Loop over the image list and download each image
for i, img_src in enumerate(img_src_list):
    # The src is protocol-relative, so prepend the scheme
    img_src = 'https:' + img_src
    print(img_src)
    resp = requests.get(img_src, headers=headers)
    print(resp.status_code)
    img_content_bytes = resp.content

    # Write the image bytes to disk
    img_path = os.path.join(album_dir, f'{i+1}.jpg')
    print(img_path)
    with open(img_path, mode='wb') as f:
        f.write(img_content_bytes)
        print(f'Image {i+1} saved to {img_path}')

Refactored into functions

import os
import requests
from lxml import etree

# home_url = 'https://www.ivsky.com/'

# Shared by both functions below, so defined once at module level
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.141 Safari/537.36',
}

def get_single_img(img_src, album_title, i):
    # Build the full image URL (the src is protocol-relative)
    img_src = 'https:' + img_src
    print(img_src)
    resp = requests.get(img_src, headers=headers)
    print(resp.status_code)
    img_content_bytes = resp.content

    # Write the image bytes to disk
    img_path = os.path.join(os.path.dirname(__file__), album_title, f'{i + 1}.jpg')
    print(img_path)
    with open(img_path, mode='wb') as f:
        f.write(img_content_bytes)
        print(f'Image {i + 1} saved to {img_path}')

def get_single_album(album_url):
    # album_url example: 'https://www.ivsky.com/tupian/lugui_v62472/'    # the album page holding the thumbnails

    # Request the album page
    resp = requests.get(album_url, headers=headers)
    status_code = resp.status_code
    print(status_code)
    album_html = resp.text
    print(album_html)

    # Collect the album title and the URL of every thumbnail in this album
    album_dom = etree.HTML(album_html)
    title_pattern = '//h1/text()'
    img_pattern = '//ul[@class="pli"]/li/div/a/img/@src'
    album_title = album_dom.xpath(title_pattern)[0]
    album_title = album_title.strip()   # the title may end with a space, e.g. '乌龟 '; mkdir() drops it ('乌龟'), so later path joins would not match
    img_src_list = album_dom.xpath(img_pattern)
    print(album_title)
    print(len(img_src_list), img_src_list)

    # Create a folder named after the album, e.g. '体型庞大的陆龟图片(16张)'
    album_dir = os.path.join(os.path.dirname(__file__), album_title)
    if not os.path.exists(album_dir):
        os.mkdir(album_dir)

    # Loop over the image URLs and download each one
    for i, img_src in enumerate(img_src_list):
        get_single_img(img_src, album_title, i)


# Crawl one catalog page and run every album found on it
catalog_url = 'https://www.ivsky.com/tupian/dongwutupian/index_2.html'
resp = requests.get(catalog_url, headers=headers)
html = resp.text
dom = etree.HTML(html)
# NOTE: this XPath is a guess and should be verified in the browser dev tools;
# the hrefs are site-relative, so they are joined back onto the domain
album_url_list = dom.xpath('//div[@class="il_img"]/a/@href')
for album_url in album_url_list:
    get_single_album('https://www.ivsky.com' + album_url)
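
The index_2.html name suggests the catalog pages are numbered, so the same driver could be wrapped in a page loop. This is a sketch under that assumption; the page range and the XPath still need to be verified against the live site:

for page in range(1, 6):   # assumed page range; adjust after checking the site
    catalog_url = f'https://www.ivsky.com/tupian/dongwutupian/index_{page}.html'
    resp = requests.get(catalog_url, headers=headers)
    dom = etree.HTML(resp.text)
    for album_url in dom.xpath('//div[@class="il_img"]/a/@href'):   # same assumed XPath as above
        get_single_album('https://www.ivsky.com' + album_url)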

WowUI (ui.1owo.com) design title scraper

import requests
from lxml import etree

url = 'https://ui.1owo.com/index'

resp = requests.get(url)
if resp.status_code == 200:
    html = resp.text

    dom = etree.HTML(html)
    xpath_pattern = '//div[@id="blog-block"]//div[@class="blog-date title"]/text()'
    titles = dom.xpath(xpath_pattern)
    # print('titles',titles)
    for t in titles:
        print(t)
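
Note the double slash in the middle of that XPath: / selects only direct children, while // searches descendants at any depth, which matters when wrapper elements sit between the anchor node and the target. A small self-contained demonstration:

from lxml import etree

doc = etree.HTML('<div id="blog-block"><section><div class="blog-date title">Demo</div></section></div>')
print(doc.xpath('//div[@id="blog-block"]/div[@class="blog-date title"]/text()'))    # [] because <section> breaks the direct-child chain
print(doc.xpath('//div[@id="blog-block"]//div[@class="blog-date title"]/text()'))   # ['Demo'] because // matches at any depth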

ZCool scraper (multiple albums)

import os
import requests
from lxml import etree
home_url = 'https://www.zcool.com.cn/'
# A sample work page (kept for reference; the script below scrapes the home page)
album_url = 'https://www.zcool.com.cn/work/ZNTAyNTI2NzY=.html'

headers = {
    'User-Agent' : 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.141 Safari/537.36',
}
# Request the home page
resp = requests.get(home_url, headers=headers)
status_code = resp.status_code
home_html = resp.text

# Grab each work's title and cover-image URL from the home page
home_dom = etree.HTML(home_html)
title_pattern = '//div[@class="card-box"]//a/img/@title'
img_title_list = home_dom.xpath(title_pattern)
img_pattern = '//div[@class="card-img"]//a/img/@src'
# Note: titles may carry trailing spaces
# album_title = album_title.strip()
img_src_list = home_dom.xpath(img_pattern)

# Create the base output folder once, anchored to the script's directory
base_dir = os.path.join(os.path.dirname(__file__), '站酷图集')
if not os.path.exists(base_dir):
    os.mkdir(base_dir)

print(len(img_src_list))
for i in range(len(img_src_list)):
    # Only take the first 15 covers
    if i == 15:
        break
    # Create a numbered subfolder named after the work
    str_title = str(i + 1) + '.' + str(img_title_list[i]).strip()
    work_dir = os.path.join(base_dir, str_title)
    if not os.path.exists(work_dir):
        os.mkdir(work_dir)
    # Download the cover image
    print(img_src_list[i])
    resp = requests.get(img_src_list[i], headers=headers)
    print(resp.status_code)
    img_content_bytes = resp.content
    # Write the image bytes to disk
    img_path = os.path.join(work_dir, f'{i+1}.jpg')
    print(img_path)
    with open(img_path, mode='wb') as f:
        f.write(img_content_bytes)
        print(f'Image {i+1} saved to {img_path}')
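
Scraped titles can contain characters that are illegal in file and folder names (/, \, :, *, ? and so on), which would make mkdir or open fail for some works. A small sanitizer, sketched with only the standard library, could be applied to str_title before any path is built:

import re

def safe_name(title, max_len=50):
    # replace path-hostile characters, trim whitespace, cap the length
    cleaned = re.sub(r'[\\/:*?"<>|]', '_', title).strip()
    return cleaned[:max_len] or 'untitled'

# hypothetical usage: str_title = safe_name(str(i + 1) + '.' + str(img_title_list[i]))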
