# 爬虫项目学习 (web-scraper practice projects)
# 煎蛋网标题 (jandan.net headline scraper)
import requests
from lxml import etree

# Scrape the headline titles from the jandan.net front page and print them.
jandan_url = 'http://jandan.net/'
request_headers = {
    'User-Agent' : 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.141 Safari/537.36',
}

response = requests.get(jandan_url, headers=request_headers)
print(response.status_code)
if response.status_code == 200:
    page_dom = etree.HTML(response.text)
    # Each post headline lives in an <a> under the "indexs" div of a post card.
    title_xpath = '//div[@class="post f list-post"]/div[@class="indexs"]/h2/a/text()'
    title_list = page_dom.xpath(title_xpath)
    print('titles', title_list)
    for title in title_list:
        print(title)
# 网易新闻头部爬虫 (163.com top-headline scraper)
import requests
from lxml import etree

# Scrape the top-news headlines from the 163.com news front page.
news_url = 'https://news.163.com/'
response = requests.get(news_url)
if response.status_code == 200:
    page_html = response.text
    print(page_html)  # debug dump of the raw page
    page_dom = etree.HTML(page_html)
    headline_xpath = '//ul[@class="top_news_ul"]/li/a/text()'
    headline_list = page_dom.xpath(headline_xpath)
    print('titles', headline_list)
    for headline in headline_list:
        print(headline)
# 网易新闻热点排行爬虫 (163.com hot-rank headline scraper)
import requests
from lxml import etree

# Scrape the hot-rank headlines from the 163.com news front page.
rank_url = 'https://news.163.com/'
response = requests.get(rank_url)
if response.status_code == 200:
    rank_dom = etree.HTML(response.text)
    rank_xpath = '//div[@class="mt35 mod_hot_rank clearfix"]/ul/li/a/text()'
    rank_titles = rank_dom.xpath(rank_xpath)
    print('titles', rank_titles)
    for rank_title in rank_titles:
        print(rank_title)
# 网易新闻热点排行爬虫 —— 抓取每条新闻的正文 (hot-rank list + article bodies)
import requests
from lxml import etree

# Scrape the hot-rank link list from 163.com news, then fetch each linked
# article page and print its body paragraphs.
list_url = 'https://news.163.com/'
list_resp = requests.get(list_url)
if list_resp.status_code == 200:
    list_dom = etree.HTML(list_resp.text)
    href_xpath = '//div[@class="mt35 mod_hot_rank clearfix"]/ul/li/a/@href'
    href_list = list_dom.xpath(href_xpath)
    print(href_list)
    for article_href in href_list:
        print(article_href)
        article_resp = requests.get(article_href)
        article_dom = etree.HTML(article_resp.text)
        paragraph_xpath = '//div[@class="post_body"]/p/text()'
        article_content = article_dom.xpath(paragraph_xpath)
        print('content', article_content)
# 天堂图片网爬虫(单图集) (ivsky.com single-album image downloader)
import os
import requests
from lxml import etree
home_url = 'https://www.ivsky.com/'
album_url = 'https://www.ivsky.com/tupian/lugui_v62472/'
headers = {
'User-Agent' : 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.141 Safari/537.36',
}
resp = requests.get(album_url)
status_code = resp.status_code
print(status_code)
alnum_html = resp.text
print(alnum_html)
album_dom = etree.HTML(alnum_html)
title_pattern = '//h1/text()'
img_pattern = '//ul[@class="pli"]/li/div/a/img/@src'
album_title = album_dom.xpath(title_pattern)[0]
album_title = album_title.strip()
img_src_list = album_dom.xpath(img_pattern)
print(album_title)
print(len(img_src_list),img_src_list)
if not os.path.exists('./'+album_title):
os.mkdir('./'+album_title)
for i, img_src in enumerate(img_src_list):
img_src = 'https:' + img_src
print(img_src)
resp = requests.get(img_src,headers = headers)
print(resp.status_code)
img_content_bytes = resp.content
img_path = os.path.join(os.path.dirname(__file__),album_title,f'{i+1}.jpg')
img_path = img_path.split('.')[-1]
print(img_path)
with open(img_path,mode='wb') as f:
f.write(img_content_bytes)
print(f'第{i+1}章图片保存成功,保存到了{img_path}')
# 封装 (same ivsky scraper refactored into functions)
import os
import requests
from lxml import etree
def get_single_img(img_src, album_title, i, headers=None):
    """Download one album image and save it as '<i+1>.jpg'.

    img_src: protocol-relative image src scraped from the page ("//img...").
    album_title: name of an existing folder next to this script.
    i: zero-based index; the file is saved as f'{i + 1}.jpg'.
    headers: optional request headers; defaults to a desktop-Chrome UA.
    """
    # BUG FIX: the original referenced a global `headers` that is only
    # defined locally inside get_single_album, so every call raised
    # NameError. The new keyword parameter keeps the old call sites working.
    if headers is None:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.141 Safari/537.36',
        }
    img_src = 'https:' + img_src
    print(img_src)
    resp = requests.get(img_src, headers=headers)
    print(resp.status_code)
    img_content_bytes = resp.content
    img_path = os.path.join(os.path.dirname(__file__), album_title, f'{i + 1}.jpg')
    print(img_path)
    with open(img_path, mode='wb') as f:
        f.write(img_content_bytes)
    print(f'第{i + 1}张图片保存完毕,保存到了{img_path}')
def get_single_album(album_url):
    """Scrape one ivsky.com album page and download all of its images.

    Creates a folder named after the album's <h1> title (relative to the
    current working directory) and saves each image via get_single_img.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.141 Safari/537.36',
    }
    # BUG FIX: the original built `headers` but fetched the page without it.
    resp = requests.get(album_url, headers=headers)
    print(resp.status_code)
    album_html = resp.text
    print(album_html)  # debug dump of the raw page
    album_dom = etree.HTML(album_html)
    title_pattern = '//h1/text()'
    img_pattern = '//ul[@class="pli"]/li/div/a/img/@src'
    album_title = album_dom.xpath(title_pattern)[0].strip()
    img_src_list = album_dom.xpath(img_pattern)
    print(album_title)
    print(len(img_src_list), img_src_list)
    # makedirs(exist_ok=True) avoids the check-then-create race of the
    # original os.path.exists / os.mkdir pair.
    os.makedirs('./' + album_title, exist_ok=True)
    for i, img_src in enumerate(img_src_list):
        get_single_img(img_src, album_title, i)
# Crawl one catalog page of animal albums and download every album on it.
catalog_url = 'https://www.ivsky.com/tupian/dongwutupian/index_2.html'
# BUG FIX: the original called requests.get() and etree.HTML() with no
# arguments (TypeError), never bound the response, and used the invalid
# XPath '/@href', so this driver could not run at all.
catalog_resp = requests.get(catalog_url)
catalog_html = catalog_resp.text
catalog_dom = etree.HTML(catalog_html)
# NOTE(review): the catalog list markup is not visible in this file; this
# pattern mirrors the album-page "pli" convention — confirm the actual
# class name against the live catalog page.
album_href_pattern = '//ul[@class="ali"]/li/div/a/@href'
album_href_list = catalog_dom.xpath(album_href_pattern)
print(album_href_list)
for album_href in album_href_list:
    # hrefs on ivsky are site-relative (e.g. "/tupian/...") — TODO confirm.
    get_single_album('https://www.ivsky.com' + album_href)
# 哇哦UI设计界面标题爬取 (1owo UI-design index title scraper)
import requests
from lxml import etree

# Scrape blog-entry titles from the 1owo UI-design index page.
page_url = 'https://ui.1owo.com/index'
response = requests.get(page_url)
if response.status_code == 200:
    page_dom = etree.HTML(response.text)
    title_xpath = '//div[@id="blog-block"]//div[@class="blog-date title"]/text()'
    for entry_title in page_dom.xpath(title_xpath):
        print(entry_title)
# 站酷网爬虫(多图集) (zcool.com.cn home-page card-image downloader)
import os
import requests
from lxml import etree

# Scrape the zcool.com.cn home page and save the first 15 card images,
# each into its own numbered-and-titled folder under "站酷图集".
home_url = 'https://www.zcool.com.cn/'
album_url = 'https://www.zcool.com.cn/work/ZNTAyNTI2NzY=.html'
headers = {
    'User-Agent' : 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.141 Safari/537.36',
}

resp = requests.get(home_url, headers=headers)
status_code = resp.status_code
home_html = resp.text
home_dom = etree.HTML(home_html)
title_pattern = '//div[@class="card-box"]//a/img/@title'
img_title_list = home_dom.xpath(title_pattern)
img_pattern = '//div[@class="card-img"]//a/img/@src'
img_src_list = home_dom.xpath(img_pattern)
print(len(img_src_list))  # hoisted: the original printed this every iteration

base_dir = os.path.dirname(os.path.abspath(__file__))
for i, img_src in enumerate(img_src_list):
    if i == 15:  # only keep the first 15 cards
        break
    # Robustness: titles and srcs come from different XPath queries and may
    # differ in length; stop instead of raising IndexError.
    if i >= len(img_title_list):
        break
    str_title = str(i + 1) + '.' + str(img_title_list[i])
    # BUG FIX: the original created '站酷图集./<title>' (with os.mkdir, which
    # cannot create the missing parent) but then saved into
    # os.path.join(..., '站酷图集', <title>) — a different path that never
    # existed, so every open() failed. Create exactly the directory we save to.
    album_dir = os.path.join(base_dir, '站酷图集', str_title)
    os.makedirs(album_dir, exist_ok=True)
    print(img_src)
    resp = requests.get(img_src, headers=headers)
    print(resp.status_code)
    img_content_bytes = resp.content
    img_path = os.path.join(album_dir, f'{i+1}.jpg')
    print(img_path)
    with open(img_path, mode='wb') as f:
        f.write(img_content_bytes)
    print(f'第{i+1}张图片保存成功,保存到了{img_path}')