斗鱼直播实时数据爬取

思路

1, 解析URL

页面解析

2, 利用爬虫神器 bs4 和 正则表达式得到想要的信息;
3, 进库和本地保存

Django 后台展示和本地 CSV(卖相太差,不发了)

Django后台部分数据展示

* 存储本地的CSV 直接运行 DySpyder().summary_data180() 即可*

直接上代码

# -*- coding: utf-8 -*-
import os
import re
import django
import urllib.request as ur

class DySpyder():
    """Scraper for page 1 of the Douyu live-room directory.

    Provides helpers to download the listing, turn each room ``<li>`` into a
    dict, dump the result to CSV and insert the rows into the Django ``Info``
    model.
    """

    def __init__(self):
        # No instance state is needed.
        pass

    def open_url(self, url):
        """Fetch ``url`` with a browser-like User-Agent and return the body decoded as UTF-8."""
        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20100101 Firefox/23.0'}
        req = ur.Request(url=url, headers=headers)  # python2,urllib.request()
        response = ur.urlopen(req)  # python2,urllib2.urlopen()
        return response.read().decode('utf-8')

    def tv_spyder(self):
        """Download the first directory page and return its room ``<li>`` tags.

        Only ``<li>`` elements inside ``<ul id="live-list-contentbox">`` are
        considered; one ``findAll`` pass is made per entry of ``cate`` and the
        results are concatenated into a single list.
        """
        ## this endpoint can be seen in the browser's XHR requests
        url = "https://www.douyu.com/directory/all/?page=1&isAjax=1"
        data = self.open_url(url)
        from bs4 import BeautifulSoup
        # Values passed as the second argument of findAll('li', ...) below.
        cate = ['', 'serach_lastli', 'last','lastserach_lastli']  # - - s- l - ll 6loop
        soup1 = BeautifulSoup(data, 'html.parser')
        # NOTE(review): find() yields None when the <ul> is missing, which would
        # make the findAll call below fail - confirm the page always has it.
        soup = soup1.find("ul", id='live-list-contentbox')
        res = []
        for c in cate:
            tmp = soup.findAll('li', c)
            res.extend(tmp)
        return res

    # Class-level import: binds ``datetime`` as a class attribute. set_data
    # imports the module again locally, so this line is not what it relies on.
    import datetime
    def set_data(self, x):
        """Convert one room ``<li>`` tag ``x`` into a flat dict.

        The dict holds attribute name/value pairs captured from the
        ``play-list-link`` anchor plus the keys ``dt``, ``tag``, ``dy_name``,
        ``dy_num``, ``title0``, ``img`` and ``gif``.
        """
        import datetime
        res = {}
        # title.__init__
        title0 = str(x.find("h3").next_element)
        spans = x.findAll(["span"])
        # basic info to the link
        # Takes the first child of the 3rd, 4th and 5th <span> as tag / name / viewer count.
        tag, dy_name, dy_num = tuple([s.next_element for s in spans][2:5])
        # NOTE(review): this pattern has no capture groups, so findall(...)[0]
        # is a plain string and the two-name unpacking below cannot work as
        # intended. The pattern presumably lost its HTML markup - restore it.
        parterb = r'''.*.*'''
        # the urls of img and gif
        img, gif = re.findall(parterb, repr(x))[0]
        # Eight name=value pairs of the anchor tag, captured as 16 groups.
        p2 = r'''.*a class="play-list-link" (.*?)=(.*?) (.*?)=(.*?) (.*?)=(.*?) (.*?)=(.*?) (.*?)=(.*?) (.*?)=(.*?) (.*?)=(.*?) (.*?)=(.*?)>.*'''
        t1 = [x for x in re.findall(p2, repr(x))][0]
        # the head of link-info
        # Walk the groups as (name, value) pairs; the "- 1" leaves the last pair out.
        for i in range(int(len(t1)/2 - 1)):
            res.setdefault(t1[2*i], t1[2*i+1])
        # setdefault never overwrites, so a key already captured above wins.
        res.setdefault("dt", datetime.datetime.today())
        res.setdefault('tag', tag)
        res.setdefault('dy_name', dy_name)
        res.setdefault('dy_num', dy_num)
        res.setdefault('title0', title0)
        res.setdefault('img', img)
        res.setdefault('gif', gif)

        return res

    def summary_data180(self):
        """Scrape page 1, write the rows to a hard-coded CSV path and return the DataFrame."""
        l = [self.set_data(x) for x in self.tv_spyder()]
        import pandas as pd
        df_tmp = pd.DataFrame(l)
        # The output location is a fixed Windows desktop path.
        df_tmp.to_csv("C:\\Users\\lenovo\\Desktop\\dy180.csv")
        return df_tmp

    #print(summary_data180())

    def main(self):
        """Set up Django with ``minicms.settings`` and insert one ``Info`` row per scraped room."""
        os.environ.setdefault("DJANGO_SETTINGS_MODULE", "minicms.settings")
        django.setup()

        # Imported only after django.setup() has run.
        from tv.models import Info
        from django.utils import timezone

        df = self.summary_data180()
        print(df.columns)
        import numpy as np
        array2 = np.array(df)
        # NOTE(review): values are read by column position, so this depends on
        # the DataFrame's column order (presumably alphabetical by key) -
        # verify against the installed pandas version.
        for i in range(len(df)):
            Info.objects.create(data_rid=array2[i][0],
                                data_rpos=array2[i][1],
                                data_sid=array2[i][2],
                                data_sub_rt=array2[i][3],
                                data_tid=array2[i][4],
                                dt=timezone.now(), ## time changed: uses the insert time; column 5 is skipped
                                dy_name=array2[i][6],
                                dy_num=array2[i][7],
                                gif=array2[i][8],
                                href=array2[i][9],
                                img=array2[i][10],
                                tag=array2[i][11],
                                target=array2[i][12],
                                title0=array2[i][13]
                                )
        print("执行完毕")

# Module-level scraper instance. The main() call below is left commented out,
# so creating the instance alone performs no scraping.
dyspyder = DySpyder()
#dyspyder.main()

没有 Django 模板, 爬取所有的模板2

import os
import re
import urllib.request as ur

class DySpyder():
    """Scraper bound to a single listing URL; extracts room fields with regular expressions."""

    def __init__(self, url):
        # URL of the listing page this instance will download.
        self.url = url

    def open_url(self, url):
        """Fetch ``url`` with a browser-like User-Agent and return the body decoded as UTF-8."""
        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20100101 Firefox/23.0'}
        req = ur.Request(url=url, headers=headers)  # python2,urllib.request()
        response = ur.urlopen(req)  # python2,urllib2.urlopen()
        return response.read().decode('utf-8')

    def from_url_get_all_lis(self):
        """Download ``self.url`` and return every ``<li>`` tag found in the page."""
        data = self.open_url(self.url)
        from bs4 import BeautifulSoup
        soup1 = BeautifulSoup(data, 'html.parser')
        soup = soup1.findAll("li")
        return soup

    def tv_spyder(self, x):
        """Extract ``(rid, pic, title, tag, name, see_num, href)`` from one ``<li>`` tag ``x``.

        Each field is the first ``re.findall`` match against ``str(x)``, so a
        missing field raises ``IndexError``.

        NOTE(review): the patterns for ``pic``, ``tag``, ``name`` and
        ``see_num`` look truncated (the ``pic`` line has an unterminated string
        literal), so this method does not parse as shown. The HTML fragments
        inside those patterns need to be restored.
        """
        rid = re.findall(""".*?data-rid="(.*?)".*""", str(x))[0]
        title = re.findall(""".*?title=(.*?)>.*""", str(x))[0]
        href = re.findall(""".*?href="(.*?)".*""", str(x))[0]
        pic = re.findall('''.*?, str(x))[0]
        tag = re.findall('''.*(.*?).*''', str(x))[0]
        name = re.findall('''.*(.*?).*''', str(x))[0]
        see_num = re.findall('''.*(.*?).*''', str(x))[0]
        t = rid, pic, title, tag, name, see_num, href
        return t

def get_url(page):
    """Return the Douyu "all rooms" Ajax listing URL for the given page number."""
    prefix = "https://www.douyu.com/directory/all?page="
    suffix = "&isAjax=1"
    return "".join((prefix, str(page), suffix))

import numpy as np
import pandas as pd

# Scrape listing pages 1..20, parse every <li> on each page and dump the
# collected rows to demo.csv.
res1 = []
for page in range(1, 21):
    douyu = DySpyder(get_url(page))
    for x in douyu.from_url_get_all_lis():
        try:
            res1.append(list(douyu.tv_spyder(x)))
        except Exception:
            # Best effort: an <li> that cannot be parsed is printed and skipped.
            # Catching Exception rather than using a bare except keeps Ctrl-C
            # (KeyboardInterrupt) and SystemExit able to stop the run.
            print(x)
df = pd.DataFrame(np.array(res1))
df.to_csv("demo.csv")

后续

  • 随着时间更新, 每隔 10min 自动抓取一次并写入数据库——可以获取 Tag 或者用户的规律
  • 增加 虎牙-战旗-龙珠 的数据
  • 增加图片进库和自己定义的页面实时优化; 实现多直播平台的归一化推荐

Django 存库模板

from django.db import models

# Create your models here.

class Info(models.Model):
    """One scraped Douyu room record.

    Every column is stored as text except ``dt``, which is a timestamp.
    """

    # Attributes taken from the room link (data-* values, href, target).
    data_rid = models.CharField(verbose_name="房间ID", max_length=20)
    data_rpos = models.CharField(verbose_name="", max_length=20)
    data_sid = models.CharField(verbose_name="", max_length=20)
    data_sub_rt = models.CharField(verbose_name="", max_length=20)
    data_tid = models.CharField(verbose_name="", max_length=20)
    dt = models.DateTimeField(verbose_name="时间")
    dy_name = models.CharField(verbose_name="账号名字", max_length=50)
    dy_num = models.CharField(verbose_name="观看数", max_length=20)
    gif = models.CharField(verbose_name="GIF", max_length=120)
    href = models.CharField(verbose_name="房间url", max_length=20)
    img = models.CharField(verbose_name="IMG_url", max_length=120)
    tag = models.CharField(verbose_name="标签", max_length=120)
    target = models.CharField(verbose_name="目标", max_length=20)
    title0 = models.CharField(verbose_name="标题", max_length=120)

    class Meta:
        verbose_name = '斗鱼时间信息'
        verbose_name_plural = '斗鱼时间信息180条'

    def __str__(self):
        # Displayed as "<account name>_<title>".
        return "_".join((self.dy_name, self.title0))

class ImgTools(models.Model):
    """Links a room's online image URL to a second URL labelled as the local path."""

    img_url = models.URLField(verbose_name="线上路径")
    dt = models.DateTimeField(verbose_name="时间")
    data_rid = models.CharField(verbose_name="房间ID", max_length=20)
    upload_to = models.URLField(verbose_name="本地路径")

TXT 爬取更新

def find_min(nums):
    """Locate the first position at which the sequence starts to rise.

    Scans ``nums`` from the left and returns ``(i, nums[i])`` for the first
    ``i`` whose successor is strictly greater. For a sequence that falls and
    then rises, this is the bottom of the dip.

    When no element is followed by a greater one (a single element, or a
    non-increasing sequence) the last position is returned instead of running
    past the end of the sequence. An empty input returns ``None``.
    """
    values = list(nums)
    if not values:
        return None
    for i, (current, following) in enumerate(zip(values, values[1:])):
        if following > current:
            return i, current
    # Never rises: the last element is the lowest point reached.
    last = len(values) - 1
    return last, values[last]

def set_urls(book_id):
    """Return the chapter index of a biqudu.com book as a DataFrame.

    Downloads ``http://www.biqudu.com/<book_id>/``, extracts the chapter links
    and returns the rows from the first position at which the chapter number
    starts to rise, with columns ``url``, ``title`` and ``num``.

    Relies on a module-level ``open_url(url)`` helper and on ``re`` being
    imported by the enclosing file.
    """
    import numpy as np
    import pandas as pd

    url = "http://www.biqudu.com/" + book_id + "/"
    # NOTE(review): this pattern appears to have lost its HTML markup. It must
    # capture two groups per match (url, title) for the DataFrame below to be
    # built - restore the original tags before use.
    partern = r""".*
(.*?)
.*"""
    # This approach cannot handle books split into volumes; to be improved later.
    df1 = pd.DataFrame(np.array(re.findall(partern, open_url(url))), columns=["url", "title"])
    # Chapter number = the file name of the chapter URL without ".html".
    df1["num"] = [int(re.findall(r".*/(.*?).html", x)[0]) for x in df1["url"]]
    # Start where the numbers begin to increase (presumably skipping a
    # "latest chapters" preamble at the top of the page - verify).
    start_index = find_min(df1["num"])[0]
    return df1[start_index: len(df1)]


def detail(url="http://www.biqudu.com/21_21470/1394112.html"):
    """Fetch one chapter page and return its ``<div id="content">`` element.

    ``url`` defaults to the sample chapter the original code was written for.
    """
    from bs4 import BeautifulSoup

    data = open_url(url)
    soup = BeautifulSoup(data, 'html.parser')
    content = soup.findAll('div', id="content")[0]
    return content


def test(request):
    """Django view: render the scraped chapter content with ``base_test.html``."""
    from django.shortcuts import render

    content = detail()
    return render(request, "base_test.html", {"content": content})

* 近期会花精力弄微信小程序, 爬虫放置一段时间。 ==== END ====*

你可能感兴趣的:(PythonFrame,django,数据,爬虫)