A Collection of Python Web-Scraping Examples

1. Overview

This post collects the scraping examples I wrote earlier for easy reference. The functions bundled into the code are not explained individually.

2. Code

import time
from concurrent.futures import ThreadPoolExecutor

import requests
import re
import csv
from bs4 import BeautifulSoup
import os
from lxml import etree

def top250():
    # regex example 1
    for a in range(0,250,25):
        url="https://movie.douban.com/top250?start={0}".format(a)
        #print(url)
        headers={
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.36 SLBrowser/8.0.0.3161 SLBChan/25"
        }
        resp = requests.get(url, headers=headers)  # GET request with a UA header to get past basic anti-scraping checks
        page_content=resp.text
        # pattern follows the Top250 list markup: rank in <em>, title in <span class="title">,
        # year after the <br>, score in the rating <span>
        obj = re.compile(r'<li>.*?<div class="item">.*?<em class="">(?P<ranking>.*?)</em>'
                         r'.*?<span class="title">(?P<name>.*?)</span>.*?'
                         r'<br>(?P<year>.*?)&nbsp;.*?'
                         r'<span class="rating_num" property="v:average">(?P<score>.*?)</span>', re.S)
        result = obj.finditer(page_content)
        f = open("DoubanTop250.csv", mode="a")
        csvwriter = csv.writer(f)
        for i in result:
            # print("排名:{0}; 电影名:{1}; 年份:{2}; 评分:{3} ".format(
            #     i.group("ranking"),
            #     i.group("name"),
            #     i.group("year").strip(),
            #     i.group("score"))
            # )
            dic = i.groupdict()
            dic['year'] = dic['year'].strip()
            csvwriter.writerow(dic.values())
        f.close()
        time.sleep(2)  # delay between pages to avoid anti-scraping measures, otherwise the IP gets banned
        print("收集到{0}个信息".format(a + 25))


def MovieDownload():
    # regex example 2
    domain = "https://dytt89.com/"
    resp = requests.get(domain, verify=False)  # the special verify=False option skips certificate verification
    resp.encoding = 'gbk'  # the site is encoded in GBK (Chinese national standard)
    # print(resp.text)
    f = open("Dytt2022新片精品电影下载地址.csv", mode="a")
    csvwriter = csv.writer(f)
    # obj1 grabs the "2022新片精品" list block, obj2 the links inside it,
    # obj3 the movie name and download link on each detail page
    obj1 = re.compile(r'2022新片精品.*?<ul>(?P<ul>.*?)</ul>', re.S)
    obj2 = re.compile(r"<a href='(?P<href>.*?)'", re.S)
    obj3 = re.compile(r'◎片  名(?P<movie>.*?)<br />.*?'
                      r'<td style="WORD-WRAP: break-word" bgcolor="#fdfddf">'
                      r'<a href="(?P<download>.*?)">', re.S)
    child_href_list = []
    result1 = obj1.finditer(resp.text)
    for i in result1:  # first layer: the main page HTML
        ul = i.group('ul')
        # print(ul)
        # time.sleep(1)
        result2 = obj2.finditer(ul)
        for j in result2:  # second layer: links to the detail pages
            # time.sleep(1)
            child_href = domain + j.group('href').strip("/")  # join the domain with the page-specific part of the link
            child_href_list.append(child_href)
    k = 0
    for href in child_href_list:
        child_resp = requests.get(href, verify=False)
        child_resp.encoding = 'gbk'
        result3 = obj3.search(child_resp.text)  # pull out the download information we need
        # print(result3.group('movie'))
        # print(result3.group('download'))
        dic = result3.groupdict()
        csvwriter.writerow(dic.values())
        k = k + 1  # counter
        print("已收集到{0}个电影".format(k))
    f.close()


def VegetableValue():
    # POST example
    url = "http://www.xinfadi.com.cn/getPriceData.html"
    # the page source itself contains no data, so use the data URL found with the F12 network panel
    head = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.36 SLBrowser/8.0.0.3161 SLBChan/25"
    }
    f = open("菜价.csv", mode="a")
    csvwriter = csv.writer(f)
    data = {'limit': 3}  # how many records to fetch
    # difference between GET and POST here:
    # GET can only fetch whatever the current page returns
    # POST lets us control the number of records requested via data
    # resp = requests.get(url, headers=head).json()
    resp = requests.post(url, headers=head, data=data).json()
    lis = resp.get('list')
    for i in lis:
        name = i.get("prodName")
        low_price = i.get("lowPrice")
        high_price = i.get("highPrice")
        average_price = i.get("avgPrice")
        producing_area = i.get("place")
        unit = i.get("unitInfo")
        date = i.get("pubDate")
        csvwriter.writerow([name, low_price, high_price, average_price, producing_area, unit, date])
    f.close()


def CatchPicture(url):
    # bs4 example
    url_download = "https://pic.netbian.com/"
    resp = requests.get(url)
    resp.encoding = "gbk"
    # print(resp.text)
    main_page = BeautifulSoup(resp.text, "html.parser")
    alist = main_page.find("div", class_="slist").find_all("a")
    # print(alist)
    for a in alist:
        # print(a.get('href'))
        href = url_download + a.get('href').strip("/")  # join the domain with the page-specific part of the link
        # print(href)
        child_page_resp = requests.get(href)
        child_page_resp.encoding = 'gbk'
        child_page_text = child_page_resp.text
        child_page = BeautifulSoup(child_page_text, "html.parser")
        img = child_page.find("div", class_="photo").find("img")
        img_name = child_page.find("div", class_="photo").find("img").get("title")
        # print(img.get("src"))
        src = url_download + img.get("src").strip("/")
        # print(src)
        # print(img_name)
        img_resp = requests.get(src)
        img = img_resp.content  # raw bytes of the image
        with open("img2/" + img_name + ".jpeg", mode="wb") as f:
            f.write(img)
        print(img_name + "下载好了!!")
        # break
        time.sleep(0.5)  # necessary delay to avoid anti-scraping measures


def Xpath():
    # the live site kept returning empty lists for some reason, yet a hand-made local
    # html file can be accessed without trouble (see the note after the code listing)
    tree = etree.parse('file:///C:/Users/86183/Desktop/1.html')
    r1 = tree.xpath('/html/body/div[2]/p')  # walk the node path straight down from the root
    # /html/body/div[2]/p[1]
    for div in r1:
        # /html/body/div[2]/p[1]
        a = div.xpath('./text()')
        print(a)
    # in the browser console an XPath can be tested with $x("xpath expression");
    # a correct expression returns values, a wrong one returns nothing


def Vidio():
    # bypassing the Pearvideo Referer anti-hotlink check
    url = "https://www.pearvideo.com/video_1733893"  # the video page to pull
    contId = url.split("_")[1]  # take 1733893
    resp = requests.get(url)
    resp.encoding = "utf-8"
    # print(resp.text)
    main_page = BeautifulSoup(resp.text, "html.parser")
    title = main_page.find("div", class_="box-left clear-mar").find("h1").text
    # print(title)
    header = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.36 SLBrowser/8.0.0.3161 SLBChan/25"
        # Referer anti-hotlinking: the server checks where the request came from; the visit order
        # must be 1->2->3, so add a Referer to simulate it; a direct 1->3 request fails
        , "Referer": url
    }
    vidio_status = f"https://www.pearvideo.com/videoStatus.jsp?contId={contId}&mrd=0.5623242368176109"
    resp = requests.get(vidio_status, headers=header)
    # print(resp.text)
    dic = resp.json()
    # print(dic)
    srcUrl = dic["videoInfo"]["videos"]['srcUrl']
    systemTime = dic['systemTime']  # systemTime:1660186591481
    # fake: https://video.pearvideo.com/mp4/adshort/20210701/1660186531481-15708981_adpkg-ad_hd.mp4
    # real: https://video.pearvideo.com/mp4/adshort/20210701/cont-1733893-15708981_adpkg-ad_hd.mp4
    srcUrl_true = srcUrl.replace(systemTime, f"cont-{contId}")  # replace the useless part of the fake link to get the real one
    # print(srcUrl_true)
    with open("videos/" + title + ".mp4", mode="wb") as f:
        f.write(requests.get(srcUrl_true).content)
    print(title + "下载完成!")


def aiodownload(cid, title, book):
    url = f"https://www.23qb.com/book/{cid}.html"
    page = 1
    with open(f"novels/{book}.txt", mode="a+") as f:
        f.write("\n")
        f.write("\n" + title + "\n")
        f.write("\n")
        while True:
            resp = requests.get(url).text
            page_thing = BeautifulSoup(resp.replace('\ufffd', ''), "html.parser")
            lists = page_thing.find_all("div", class_="read-content")
            for texts in lists:
                text = texts.find_all('p')
            del text[-1]
            if text[-1].string == "(继续下一页)":
                # the chapter continues on another page: drop the marker and follow it
                del text[-1]
                page = page + 1
                url = f"https://www.23qb.com/book/{cid}_{page}.html"
                for line in text:
                    txt = line.string
                    try:
                        f.write(txt + "\n")
                    except Exception as e:
                        f.write("!!!!!!!!" + "\n")
                        continue
                continue
            else:
                for line in text:
                    txt = line.string
                    try:
                        f.write(txt + "\n")
                    except Exception as e:
                        f.write("!!!!!!!!" + "\n")
                        continue
                break
    print(title + "下载完成")


def getCatalog(url):
    resp = requests.get(url)
    # print(resp.text)
    # patterns written against the catalog page markup: the book title sits in an <h1>,
    # the chapter links in a <ul> list (adjust if the site's markup differs)
    obj1 = re.compile(r'<h1>(?P<book>.*?)</h1>.*?'
                      r'<ul>(?P<url>.*?)</ul>', re.S)
    obj2 = re.compile(r'<li><a href="/book/(?P<c_id>.*?)\.html">(?P<name>.*?)</a></li>', re.S)
    main_page = resp.text
    result = obj1.finditer(main_page)
    for i in result:
        ul = i.group('url')
        book = i.group("book")
        # print(ul)
        result2 = obj2.finditer(ul)
        for ii in result2:
            cid = ii.group("c_id")
            title = ii.group("name")
            aiodownload(cid, title, book)
        print(book + "下载完成!")


if __name__ == '__main__':
    # top250()
    # MovieDownload()
    # VegetableValue()

    # main routine for CatchPicture
    '''
    start_time = time.time()
    with ThreadPoolExecutor(10) as t:  # only 10 threads are opened here, more are possible
        for i in range(2, 119):
            t.submit(CatchPicture, f"https://pic.netbian.com/4kdongman/index_{i}.html")
            time.sleep(1)  # with multiple threads remember to add a delay, or the site bans the IP (it has already banned four of mine)
            print(f"第{i}页内容下载完毕")  # if the console only shows this line, the IP has been blocked by the site
        print("全部下载完毕")
    end_time = time.time()
    print('总共的时间为:', round(end_time - start_time, 2), '秒')
    '''

    # Xpath()
    # Vidio()

    # novel download
    start_time = time.time()
    b_id = "116418"  # input("输入你想下载的书的id:")  # "60218","27309","4286","719","189697"
    url = f"https://www.23qb.com/book/{b_id}/"
    getCatalog(url)
    end_time = time.time()
    print("下载时间为:", round(end_time - start_time, 2), '秒')
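Note: the Xpath() function above falls back to a local HTML file because the live site kept returning empty lists. A common cause is that the HTML returned to requests differs from the DOM shown in the browser's Elements panel: browsers normalize markup (for example, inserting tbody into tables) and JavaScript may fill in content after the page loads, so an absolute path copied from devtools often matches nothing in the raw response. The sketch below runs XPath directly on a fetched page; the URL and the //p expression are placeholder assumptions, not taken from the code above.

import requests
from lxml import etree

resp = requests.get("https://example.com/")  # placeholder URL, replace with the target page
tree = etree.HTML(resp.text)                 # parse the raw HTML string that requests actually received
# prefer short, relative, attribute-based expressions over absolute paths copied from devtools
for node in tree.xpath('//p'):
    print(node.xpath('string(.)'))           # full text content of each <p> node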
