Python 获取有声段子

文章目录

          • 1. 参考文档
          • 2. 源码

1. 参考文档

https://blog.csdn.net/gklcsdn/article/details/103204955

2. 源码
# @Time : 2020/7/10 18:56
# @Author : GKL
# FileName : spider.py
# Software : PyCharm

import os
import re
from threading import Lock, Thread

import requests
from aip import AipSpeech
from lxml import etree


class Spider(object):
    """Crawl joke pages from duanziwang.com and save each joke as speech audio.

    For every article found on a listing page, the joke text is sent to the
    Baidu ``AipSpeech`` client for speech synthesis and the returned audio
    bytes are written to ``video/<title>.mp3``.

    One instance may be shared by several threads: the page counter and the
    set of already-claimed URLs are guarded by a lock so that each listing
    page is processed at most once.
    """

    # Crawling continues while the shared page counter is below this value.
    MAX_PAGE = 20
    # Directory the synthesized audio files are written to.
    OUTPUT_DIR = 'video'
    # Template for listing-page URLs, filled with the page number.
    PAGE_URL = 'https://duanziwang.com/page/{}/'
    # Seconds to wait for the site before giving up on a page.
    REQUEST_TIMEOUT = 10

    def __init__(self):
        self.headers = {
            'user-agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36'
        }
        # Placeholder credentials; replace with real Baidu AI application keys.
        APP_ID = '***'
        API_KEY = '***'
        SECRET_KEY = '***'

        self.client = AipSpeech(APP_ID, API_KEY, SECRET_KEY)
        # Number of the most recently handed-out listing page.
        self.page = 1
        # Guards self.page and self._claimed when get_data runs in threads.
        self._lock = Lock()
        # URLs that some caller has already taken responsibility for.
        self._claimed = set()

    def get_data(self, url):
        """Process ``url`` and then keep crawling subsequent listing pages.

        After handling ``url`` the method repeatedly advances the shared page
        counter and processes the corresponding page, stopping once the
        counter reaches ``MAX_PAGE``. URLs already claimed by another caller
        are skipped, so concurrent calls on one instance do not duplicate work.

        :param url: URL of the first listing page this call should handle.
        """
        # Nothing to do once the crawl has already run to completion.
        if self.page >= self.MAX_PAGE:
            return

        os.makedirs(self.OUTPUT_DIR, exist_ok=True)

        # Iterate instead of recursing so the stack does not grow per page.
        while url is not None:
            if self._claim(url):
                self._process_page(url)
            url = self._next_url()

    def _claim(self, url):
        """Return True if ``url`` was not claimed before, and mark it claimed."""
        with self._lock:
            if url in self._claimed:
                return False
            self._claimed.add(url)
            return True

    def _next_url(self):
        """Advance the page counter and return the next URL, or None at the limit."""
        with self._lock:
            if self.page >= self.MAX_PAGE:
                return None
            self.page += 1
            if self.page >= self.MAX_PAGE:
                return None
            return self.PAGE_URL.format(self.page)

    def _process_page(self, url):
        """Download one listing page and synthesize audio for each joke on it."""
        try:
            response = requests.get(url, headers=self.headers,
                                    timeout=self.REQUEST_TIMEOUT)
        except requests.RequestException as exc:
            # A single unreachable page should not abort the whole crawl.
            print('skip {}: {}'.format(url, exc))
            return

        tree = etree.HTML(response.text)
        if tree is None:
            return

        for node in tree.xpath('//main[@class="col-md-8 main-content"]/article'):
            titles = node.xpath('.//h1/a/text()')
            bodies = node.xpath('.//div[@class="post-content"]//code/text()')
            # Articles without a title link or a <code> body cannot be voiced.
            if not titles or not bodies:
                continue

            title = titles[0].replace('_段子网收录最新段子', '')
            content = bodies[0].strip()
            if not content:
                continue
            print(title)

            # Call the Baidu speech *synthesis* API (language 'zh', volume 5).
            result = self.client.synthesis(content, 'zh', 1, {'vol': 5})

            # Success yields the audio bytes; a failure yields a dict with
            # the error details, in which case nothing is written.
            if isinstance(result, dict):
                continue

            path = os.path.join(self.OUTPUT_DIR,
                                self._safe_filename(title) + '.mp3')
            with open(path, 'wb') as f:
                f.write(result)

    @staticmethod
    def _safe_filename(title):
        """Return ``title`` with characters that are invalid in file names replaced."""
        cleaned = re.sub(r'[\\/:*?"<>|\x00-\x1f]', '_', title)
        # Trailing dots/spaces are rejected or silently dropped on Windows.
        cleaned = cleaned.strip().rstrip('. ')
        return cleaned or 'untitled'


if __name__ == '__main__':
    # One Spider instance is shared by three worker threads, each seeded
    # with one of the first three listing pages.
    spider = Spider()
    page_template = 'https://duanziwang.com/page/{}/'
    for page_no in (1, 2, 3):
        seed_url = page_template.format(page_no)
        worker = Thread(target=spider.get_data, args=(seed_url,))
        worker.start()

你可能感兴趣的:(spider)