Python: Web Crawlers

Contents

 

1. Classic crawler: BeautifulSoup

2. Regular expressions

Crawling the Doupo Cangqiong novel

Crawling Qiushibaike

3. Crawling with lxml

Excel storage: crawling Qidian Chinese novels

Image crawling: downloading images from Mzitu

Database storage: crawling the Douban music Top 250

Multithreaded and multiprocess crawlers

PyCharm-compatible crawler

Not compatible with PyCharm; must be run in IDEA

Crawler: IP proxies

Mobile crawler: scraping Baidu Wenku documents

4. Case studies

Case 1: crawling Meituan merchant data

1. Non-food merchants

2. Food merchants

Case 2: online translation with Youdao Dict


1. Classic crawler: BeautifulSoup

import requests
from bs4 import BeautifulSoup
import time    # import the libraries

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36'
}    # request headers

def get_info(url):    # fetch and parse one ranking page
    wb_data = requests.get(url, headers=headers)
    soup = BeautifulSoup(wb_data.text, 'lxml')
    ranks = soup.select('span.pc_temp_num')
    titles = soup.select('div.pc_temp_songlist > ul > li > a')
    times = soup.select('span.pc_temp_tips_r > span')
    for rank, title, duration in zip(ranks, titles, times):    # pair up the rank, title and duration of each song
        data = {
            'rank': rank.get_text().strip(),
            'singer': title.get_text().split('-')[0],
            'song': title.get_text().split('-')[1],
            'time': duration.get_text().strip()
        }
        print(data)

if __name__ == '__main__':
    urls = ['http://www.kugou.com/yy/rank/home/{}-8888.html'.format(str(i)) for i in range(1, 24)]    # 23 URLs: the Kugou Top 500 spans 23 pages
    for url in urls:
        get_info(url)
        time.sleep(1)    # pause between pages to be polite

2. Regular expressions

Crawling the Doupo Cangqiong novel

import requests
import re
import time

headers = {
    'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36'
}
f = open('G:/doupo.txt','a+')

def get_info(url):
    res = requests.get(url, headers=headers)
    if res.status_code == 200:
        # the novel text sits inside <p>...</p> tags on each chapter page
        contents = re.findall('<p>(.*?)</p>', res.content.decode('utf-8'), re.S)
        for content in contents:
            f.write(content + '\n')
    else:
        pass

if __name__ == '__main__':
    urls = ['http://www.doupoxs.com/doupocangqiong/{}.html'.format(str(i)) for i in range(2, 3)]
    for url in urls:
        get_info(url)
        time.sleep(1)
    f.close()

Crawling Qiushibaike

import requests
import re
headers = {
    'User-Agent' :'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36'
}
info_lists = []
def judgment_sex(class_name):
    if class_name == 'womenIcon':
        return '女'
    else:
        return  '男'
def get_info(url):
    res = requests.get(url)
    # the patterns below target the Qiushibaike markup at the time of writing
    ids = re.findall('<h2>(.*?)</h2>', res.text, re.S)
    levels = re.findall('<div class="articleGender \D+Icon">(.*?)</div>', res.text, re.S)
    sexs = re.findall('<div class="articleGender (\D+Icon)">', res.text, re.S)
    contents = re.findall('<div class="content">.*?<span>(.*?)</span>', res.text, re.S)
    laughs = re.findall('<span class="stats-vote"><i class="number">(\d+)</i>', res.text, re.S)
    comments = re.findall('<i class="number">(\d+)</i> 评论', res.text, re.S)
    for id, level, sex, content, laugh, comment in zip(ids, levels, sexs, contents, laughs, comments):
        info = {
            'id': id,
            'level': level,
            'sex': judgment_sex(sex),
            'content': content,
            'laugh': laugh,
            'comment': comment
        }
        info_lists.append(info)

if __name__ == '__main__':
    urls = ['http://www.qiushibaike.com/text/page/{}/'.format(str(i)) for i in range(1, 36)]
    for url in urls:
        get_info(url)
    for info_list in info_lists:
        print(info_list)

3. Crawling with lxml

Excel storage: crawling Qidian Chinese novels

import xlwt
import requests
from lxml import etree
import time
all_info_list = []
def get_info(url):
    html = requests.get(url)
    selector = etree.HTML(html.text)
    infos = selector.xpath('//ul[@class="all-img-list cf"]/li')
    for info in infos:
        title = info.xpath('div[2]/h4/a/text()')[0]
        author = info.xpath('div[2]/p[1]/a[1]/text()')[0]
        style_1 = info.xpath('div[2]/p[1]/a[2]/text()')[0]
        style_2 = info.xpath('div[2]/p[1]/a[3]/text()')[0]
        style = style_1 + '·' + style_2
        complete = info.xpath('div[2]/p[1]/span/text()')[0]
        introduce = info.xpath('div[2]/p[2]/text()')[0].strip()
        word = info.xpath('div[2]/p[3]/span/text()')[0].strip('万字')
        info_list = [title, author, style, complete, introduce, word]
        all_info_list.append(info_list)
if __name__ == '__main__':
    urls = ['http://a.qidian.com/?page={}'.format(str(i)) for i in range(1, 101)]
    for url in urls:
        get_info(url)
    header = ['title', 'author', 'style', 'complete', 'introduce', 'word']
    book = xlwt.Workbook(encoding='utf-8')
    sheet = book.add_sheet('Sheet1')
    for h in range(len(header)):
        sheet.write(0, h, header[h])
    i = 1
    for info_list in all_info_list:
        j = 0
        for data in info_list:
            sheet.write(i, j, data)
            j += 1
        i += 1
    book.save('xiaoshuo.xls')

Image crawling: downloading images from Mzitu

import requests
import os
from lxml import etree
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.139 Safari/537.36',
}
path = 'G:/photo/'
def get_girlphoto(url):
    html = requests.get(url, headers=headers)
    selector = etree.HTML(html.text)

    girlphoto_urls = selector.xpath('//div/ul/li/a/img/@data-original')
    print(girlphoto_urls)
    
    for item in girlphoto_urls:
        data = requests.get(item, headers=headers)
        with open(path + item[-10:], 'wb') as f:    # name the file after the last 10 characters of the image URL
            f.write(data.content)

if __name__ == '__main__':  # entry point
    urls = ["https://www.mzitu.com/page/{}".format(str(i)) for i in range(2, 11)]
    for url in urls:
        get_girlphoto(url)

Database storage: crawling the Douban music Top 250

import requests
import time  # import the libraries
from lxml import etree
import pymysql
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36'
}  # request headers
def get_info(url):  # fetch one page and write its records to MySQL
    db = pymysql.connect(host='localhost', user='root', password='123456', database='music')
    cursor = db.cursor()
    wb_data = requests.get(url, headers=headers)
    selector = etree.HTML(wb_data.text)
    names = selector.xpath('//div[@class="pl2"]/a/text()')
    titles = selector.xpath('//p[@class="pl"]/text()')
    for name, title in zip(names, titles):  # pair each album name with its detail line
        '''data = {
            'name': name.strip(),
            'author': title.split('/')[0],
            'time': title.split('/')[1],
        }'''
        a = name.strip()
        b = title.split('/')[0]
        c = title.split('/')[1]
        # a parameterized query avoids the quoting problems of manual string concatenation
        cursor.execute("insert into music values(%s, %s, %s)", (a, b, c))
        print(a, b, c)
        db.commit()
    cursor.close()
    db.close()
if __name__ == '__main__':
    urls = ['https://music.douban.com/top250?start={}'.format(str(i)) for i in range(25, 225, 25)]  # start=25..200, i.e. pages 2-9 of the Douban music Top 250
    for url in urls:
        get_info(url)
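
The insert above assumes that a table named music with three string columns already exists in the music database. A minimal sketch of creating it (the column names are chosen here for illustration and are not from the original post):

import pymysql

# assumed schema: three VARCHAR columns for album name, artist and release date
db = pymysql.connect(host='localhost', user='root', password='123456', database='music')
cursor = db.cursor()
cursor.execute("""
    CREATE TABLE IF NOT EXISTS music (
        name   VARCHAR(200),
        author VARCHAR(200),
        time   VARCHAR(100)
    ) DEFAULT CHARSET = utf8mb4
""")
db.commit()
db.close()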

Multithreaded and multiprocess crawlers

PyCharm-compatible crawler

import requests
from queue import Queue
from lxml import etree
import threading
import time
class bsSpider(object):
    """Producer/consumer spider: fetch threads fill a response queue that parse threads drain."""

    def __init__(self):
        self.headers = {"User-Agent": "Mozilla/5.0"}
        self.url = 'http://www.budejie.com/'

        # queue of page URLs waiting to be fetched
        self.urlQueue = Queue()
        # queue of fetched HTML waiting to be parsed
        self.resQueue = Queue()

    def getUrl(self):
        for p in range(1, 51):
            url = self.url + str(p)
            self.urlQueue.put(url)

    def getHtml(self):
        while True:
            # take a URL from the URL queue
            url = self.urlQueue.get()
            res = requests.get(url, headers=self.headers)
            res.encoding = "utf-8"
            html = res.text
            # hand the HTML to the parse threads
            self.resQueue.put(html)
            # mark this URL as processed
            self.urlQueue.task_done()

    def getText(self):
        while True:
            # take a fetched page from the response queue
            html = self.resQueue.get()
            parseHtml = etree.HTML(html)
            title_list = parseHtml.xpath('//div[@class="j-r-list-c-desc"]/a/text()')
            # print every extracted title
            for title in title_list:
                self.show(title)
            # mark this page as parsed
            self.resQueue.task_done()

    def show(self, title):
        print(title)

    def run(self):
        thList = []
        self.getUrl()   # enqueue the 50 page URLs

        # create the fetch threads and add them to the thread list
        for i in range(5):
            thRes = threading.Thread(target=self.getHtml)
            thList.append(thRes)

        # create the parse threads and add them to the thread list
        for i in range(5):
            thParse = threading.Thread(target=self.getText)
            thList.append(thParse)

        # start all threads
        for th in thList:
            th.daemon = True  # daemon threads exit automatically with the main thread
            th.start()

        self.urlQueue.join()    # block until every URL has been fetched

        self.resQueue.join()    # block until every page has been parsed


if __name__ == '__main__':
    spider = bsSpider()
    spider.run()
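
For comparison, the same fetch-then-parse pipeline can be written with the standard-library thread pool. This is only a sketch, assuming the page URLs and the XPath used above still work:

import requests
from lxml import etree
from concurrent.futures import ThreadPoolExecutor

headers = {"User-Agent": "Mozilla/5.0"}

def fetch_titles(url):
    # fetch one page and return the joke titles on it
    res = requests.get(url, headers=headers)
    res.encoding = "utf-8"
    parsed = etree.HTML(res.text)
    return parsed.xpath('//div[@class="j-r-list-c-desc"]/a/text()')

if __name__ == '__main__':
    urls = ['http://www.budejie.com/' + str(p) for p in range(1, 51)]
    with ThreadPoolExecutor(max_workers=10) as pool:
        for titles in pool.map(fetch_titles, urls):
            for title in titles:
                print(title)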

Not compatible with PyCharm; must be run in IDEA

import requests
import os
from lxml import etree
import xlwt
import time
from multiprocessing import Pool
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.139 Safari/537.36',
}
def get_girlphoto(url):
    html = requests.get(url, headers=headers)
    selector = etree.HTML(html.text)
    purl=selector.xpath("//a[@class='resule_img_a']/@href")
    for i in purl:
        html = requests.get(i, headers=headers)
        selector = etree.HTML(html.text)
        titles=selector.xpath("//h4/em/text()")
        addresses=selector.xpath("//span[@class='pr5']/text()")
        dolors=selector.xpath("//*[@id='pricePart']/div[1]/span/text()")
        people_names=selector.xpath("//a[@class='lorder_name']/@title")
        for title,address,dolor,people_name in zip(titles,addresses,dolors,people_names):
            info_list = [title,str(address).strip(),dolor,people_name]

if __name__ == '__main__':  # entry point
    urls=["http://bj.xiaozhu.com/search-duanzufang-p{}-0/".format(str(i)) for i  in range(1,10)]
    start1=time.time()
    for i in urls:
        get_girlphoto(i)
    end1=time.time()
    print('单进程',end1-start1)

    start2=time.time()
    pool=Pool(processes=10)
    pool.map(get_girlphoto,urls)
    end2=time.time()
    print('10进程',end2-start2)

Crawler: IP proxies

Free proxy servers: https://www.xicidaili.com/nn/

import requests

proxies = {
    "http": "http://192.10.1.10:8080",
    "https": "http://193.121.1.10:9080",
}
requests.get("http://targetwebsite.com", proxies=proxies)
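
Free proxies fail frequently, so in practice it helps to keep a small pool and retry on failure. A minimal sketch, where the proxy addresses are placeholders rather than live servers:

import random
import requests

# placeholder addresses; replace them with live proxies scraped from a proxy list site
proxy_pool = [
    "http://192.10.1.10:8080",
    "http://193.121.1.10:9080",
]

def get_with_proxy(url, retries=3):
    # try up to `retries` randomly chosen proxies before giving up
    for _ in range(retries):
        proxy = random.choice(proxy_pool)
        try:
            return requests.get(url, proxies={"http": proxy, "https": proxy}, timeout=5)
        except requests.RequestException:
            continue
    return None

resp = get_with_proxy("http://targetwebsite.com")
print(resp.status_code if resp else "all proxies failed")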

Mobile crawler: scraping Baidu Wenku documents

# -*- coding: utf-8 -*-
from selenium import webdriver
from bs4 import BeautifulSoup
from selenium.webdriver import ActionChains
import time
headers = {
    "User-Agent": "Mozilla/5.0 (Linux; Android 5.0; SM-G900P Build/LRX21T) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.181 Mobile Safari/537.36"
}


def get_num(url):
    driver = webdriver.PhantomJS()
    driver.get(url)
    html = driver.page_source
    page_count_get = BeautifulSoup(html, 'lxml')
    page_count_gets = page_count_get.find(class_='page-count')
    pagecount = page_count_gets.get_text()
    num = int(pagecount[1:])  # drop the leading character and keep the numeric page count
    temp = num
    page1 = 1  # start crawling from page one
    # accumulated document text
    all_text = ''
    while page1 <= temp:
        x = 'pageNo-' + str(page1)  # id of the element holding the current page
        html = driver.page_source
        soup = BeautifulSoup(html, 'lxml')  # load the html
        soups = soup.find_all(id=x)  # find every element whose id is x
        for each in soups:
            text = each.get_text()  # pull the text out of the element
            all_text += text
        if num > 1 and text == '':  # more pages remain but no text was found: click "continue reading"
            page = driver.find_element_by_css_selector("#html-reader-go-more")  # container around the button
            pagebutton = driver.find_element_by_css_selector("#html-reader-go-more .banner-more-btn")  # the "continue reading" button itself
            ActionChains(driver).move_to_element(page).click(pagebutton).perform()  # simulate the mouse click
            time.sleep(2)
            num = num / 5
            page1 = page1 - 1
        elif num < 1 and text == '':  # no more clicking needed but no text yet: scroll so the page lazy-loads
            target = driver.find_element_by_id(x)  # locate the element for the page currently being crawled
            driver.execute_script("arguments[0].scrollIntoView();", target)  # scroll it into view
            time.sleep(2)
            page1 = page1 - 1
        page1 = page1 + 1
    print(all_text)
    driver.quit()
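
PhantomJS is no longer maintained and recent Selenium releases have dropped support for it, so a headless Chrome session with mobile emulation is a common substitute. A minimal sketch, assuming chromedriver is installed locally (the device name is just an example):

from selenium import webdriver

options = webdriver.ChromeOptions()
options.add_argument('--headless')
# emulate a phone so the site serves its mobile pages, matching the mobile User-Agent above
options.add_experimental_option('mobileEmulation', {'deviceName': 'Nexus 5'})
driver = webdriver.Chrome(options=options)  # assumes chromedriver is on the PATH
driver.get('https://wenku.baidu.com')
print(driver.title)
driver.quit()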



4. Case studies

Case 1: crawling Meituan merchant data

1. Non-food merchants

import xlwt
import requests
from lxml import etree
import time

all_info_list = []
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.71 Safari/537.36',
        'Cookie': '_lxsdk_cuid=16bd560125bc8-0f8928eb371893-e323069-1fa400-16bd560125bc8;ci=1;rvct=1;_hc.v=fcff8e99-91bf-6483-6184-9a249bc61b80.1562654914;Hm_lvt_f66b37722f586a240d4621318a5a6ebe=1562749731;__utma=211559370.351478480.1562749734.1562749734.1562749734.1;__utmz=211559370.1562749734.1.1.utmcsr=baidu|utmccn=baidu|utmcmd=organic|utmcct=zt_search;client-id=da6ade00-71c3-4821-af5f-d539b16b5955;uuid=c2d42edd0bda4f44acc5.1563193350.1.0.0;_lx_utm=utm_source%3Dbaidu%26utm_campaign%3Dbaidu%26utm_medium%3Dorganic%26utm_content%3Dzt_search;__mta=213669163.1563193361179.1563193361179.1563193361179.1;_lxsdk_s=16bf595928c-1b-bd2-c98%7C%7C5'}
def get_info(url):
    html = requests.get(url,headers=headers)
    selector = etree.HTML(html.text)
    titles = selector.xpath('//div[@class="list-item-desc-top"]/a/text()')
    address = selector.xpath('//div[@class="item-site-info clearfix"]/span[2]/text()')
    p = selector.xpath('//div[@class="item-eval-info clearfix"]/span[1]/text()')  # ratings sit at the odd positions of this list
    r = selector.xpath('//div[@class="item-price-info"]/span/text()')  # average prices sit at the even positions of this list
    pingfens = []
    renjuns = []
    for i in range(0, len(p), 2):
        pingfens.append(p[i])
    for i in range(1, len(r), 2):
        renjuns.append(r[i])

    for title,addres,pingfen,renjun in zip(titles,address,pingfens,renjuns):
        info_list=[title,addres,pingfen+"分","¥"+renjun]
        all_info_list.append(info_list)
        print(info_list)
if __name__ == '__main__':

    urls = ["https://bj.meituan.com/jiehun/c20198/pn{}/".format(str(i))
        for i in range(1, 20)]

    for url in urls:
        get_info(url)
    #get_info("https://bj.meituan.com/yundongjianshen/c20268/")
    header = ['店名', '地址', '评分', '人均']
    book = xlwt.Workbook(encoding='utf-8')
    sheet = book.add_sheet('Sheet1')
    for h in range(len(header)):
        sheet.write(0, h, header[h])
    i = 1
    for info_list in all_info_list:
        j = 0
        for data in info_list:
            sheet.write(i, j, data)
            j += 1
        i += 1
    book.save('meituan.xls')

2. Food merchants

import xlwt
import requests
from lxml import etree
import re
import time

all_info_list = []
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.71 Safari/537.36',
        'Cookie':'_lxsdk_cuid=16bd560125bc8-0f8928eb371893-e323069-1fa400-16bd560125bc8;ci=1;rvct=1;_hc.v=fcff8e99-91bf-6483-6184-9a249bc61b80.1562654914;Hm_lvt_f66b37722f586a240d4621318a5a6ebe=1562749731;__utma=211559370.351478480.1562749734.1562749734.1562749734.1;__utmz=211559370.1562749734.1.1.utmcsr=baidu|utmccn=baidu|utmcmd=organic|utmcct=zt_search;client-id=da6ade00-71c3-4821-af5f-d539b16b5955;uuid=c2d42edd0bda4f44acc5.1563193350.1.0.0;_lx_utm=utm_source%3Dbaidu%26utm_campaign%3Dbaidu%26utm_medium%3Dorganic%26utm_content%3Dzt_search;__mta=213669163.1563193361179.1563193361179.1563193361179.1;_lxsdk_s=16bf595928c-1b-bd2-c98%7C%7C5' }
def get_info(url):
    html = requests.get(url,headers=headers)
    # keep only the embedded JSON-like block between "poiInfos" and "comHeader"
    html2 = str(html.text)[str(html.text).find("poiInfos"):str(html.text).find("comHeader")]
    chunks = html2.split("poiId")    # one chunk per merchant
    for i in chunks:
        # slice the fragment for one merchant; note that str.strip() removes a *set* of characters
        # from both ends (not a prefix), so names ending in those characters may be over-stripped
        s = str(i)[str(i).find("title"):str(i).find("avgPrice")]
        a = s[s.find("title"):s.find("avgScore")].strip('title":"')
        title = a.strip('",')
        b = s[s.find("address"):].strip('address":"')
        addres = b.strip('",')
        c = s[s.find("avgScore"):s.find("allCommentNum")].strip('avgScore":')
        pingfen = c.strip(',')
        d = s[s.find("allCommentNum"):s.find("address")].strip('allCommentNum":')    # this slice holds allCommentNum, not the average price
        renjun = d.strip(',')
        info_list = [title, addres, pingfen + "分", "¥" + renjun]
        all_info_list.append(info_list)
        print(info_list)
if __name__ == '__main__':
    urls = ["http://bj.meituan.com/meishi/c54/pn{}/".format(str(i))
        for i in range(1, 20)]

    for url in urls:
        get_info(url)
    header = ['店名', '地址', '评分', '人均']
    book = xlwt.Workbook(encoding='utf-8')
    sheet = book.add_sheet('Sheet1')
    for h in range(len(header)):
        sheet.write(0, h, header[h])
    i = 1
    for info_list in all_info_list:
        j = 0
        for data in info_list:
            sheet.write(i, j, data)
            j += 1
        i += 1
    book.save('meituan_meishi.xls')
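
Because the merchant data is embedded in the page as JSON, it can also be pulled out with a regular expression plus json.loads instead of manual string slicing. A minimal sketch, assuming the page still embeds a "poiInfos" array with title/address/avgScore/avgPrice fields:

import json
import re
import requests

headers = {'User-Agent': 'Mozilla/5.0'}

def get_info_json(url):
    html = requests.get(url, headers=headers).text
    # grab the JSON array that follows "poiInfos" in the page source
    match = re.search(r'"poiInfos":(\[.*?\])', html, re.S)
    if not match:
        return []
    results = []
    for shop in json.loads(match.group(1)):
        results.append([shop.get('title'),
                        shop.get('address'),
                        str(shop.get('avgScore')) + '分',
                        '¥' + str(shop.get('avgPrice'))])
    return results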


Case 2: online translation with Youdao Dict

import requests
import re
import time
import urllib3
import hashlib
headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36',
            'Origin': 'http://fanyi.youdao.com/',  # Origin marks where the request originated and is only sent with POST requests
            'Referer': 'http://fanyi.youdao.com/', # Referer is sent with every type of request
}
class YoudaoFanyi:
    def get_info(self, key):
        data = {    # form fields found by capturing the browser's request
            'i': key,
            'from': 'AUTO',
            'to': 'AUTO',
            'smartresult': 'dict',
            'client': 'fanyideskweb',
            'salt': '15675741889781',
            'sign': 'fa78728c931e9e682fae1c39c2b43b3a',
            'doctype': 'json',
            'version': '2.1',
            'keyfrom': 'fanyi.web',
            'action': 'FY_BY_CLICKBUTTION', # records whether the query was submitted with Enter or by clicking the button
            'typoResult': 'true'
        }
        url = 'http://fanyi.youdao.com/translate?smartresult=dict&smartresult=rule&sessionFrom=null'    # a Youdao translation endpoint
        res = requests.post(url, headers=headers, data=data)
        youdaojson = res.json()
        print('翻译的结果是:%s' % (youdaojson['translateResult'][0][0]['tgt']))
        time.sleep(2)


if __name__ == '__main__':
    youdao=YoudaoFanyi()
    while True:
        key = input("请输入你要翻译的文字('quit':退出): ").strip()
        if key == 'quit':
            break
        youdao.get_info(key)
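
The salt and sign above are hard-coded values copied from one captured request; Youdao actually derives them from the query text, a millisecond timestamp, and a secret string (which is presumably why hashlib is imported). A sketch of the usual derivation, where the secret constant is an assumption that changes between site revisions:

import time
import random
import hashlib

def make_salt_and_sign(key, secret='placeholder-secret'):  # 'secret' must be taken from the site's current JS; this value is a placeholder
    # salt: millisecond timestamp plus one random digit; sign: md5 over client + text + salt + secret
    salt = str(int(time.time() * 1000)) + str(random.randint(0, 9))
    sign = hashlib.md5(('fanyideskweb' + key + salt + secret).encode('utf-8')).hexdigest()
    return salt, sign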

 

 

 

 

 
