Python crawler -- scraping the first ten pages of Guba [reads] [comments] [title] [author] [update time]

This is a crawler that scrapes post data from the first ten pages of Guba (the East Money stock forum).

import re, json
import requests


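# Write the collected items out to a JSON file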
def write_to_json(infos):
    with open('movies.json', 'w', encoding='utf-8') as fp:
        json.dump(infos, fp)


# Accumulates one dict per post; parse_page appends to it page by page
infos = []


# Parse the page content
def parse_page(html_str):
    # Check that the page content actually came back
    # print(html_str)
    # Principle for regex screening: narrow the match scope step by step
    # (whole page -> <ul> list -> individual <li> posts).
    # NOTE: the HTML fragments inside these patterns were lost when the post was
    # formatted; the tags used below (<ul class="newlist">, <li>, <cite>,
    # class="note", ...) are reconstructions and may need adjusting against the
    # live page source.
    ul_p = re.compile(r'<ul class="newlist".*?>(.*?)</ul>', re.S)
    ul_content = ul_p.search(html_str).group()
    # Each <li> is one post
    cite_p = re.compile(r'<li.*?>(.*?)</li>', re.S)
    cite_list = cite_p.findall(ul_content)
    # Fields per post: reads, comments, title, author, update time, detail URL
    for cite in cite_list:
        # Read count and comment count both sit in plain <cite> elements
        clk_p = re.compile(r'<cite>(.*?)</cite>', re.S)
        clk = clk_p.findall(cite)
        # Read count
        read_count = clk[0].strip()
        # Comment count
        comment = clk[1].strip()
        # Title
        title_p = re.compile(r'.*?class="note">(.*?)</a>', re.S)
        title = title_p.search(cite).group(1)
        # Author
        aut_p = re.compile(r'.*?target="_blank">(.*?)</a>')
        aut = aut_p.search(cite).group(1).strip()
        # Update time
        last_p = re.compile(r'class="last">(.*?)</cite>')
        last = last_p.search(cite).group(1)
        # Detail page URL
        url_p = re.compile(r'<a href="(.*?)"')
        url = url_p.search(cite).group(1)
        # Collect the scraped fields
        item = {}
        item['clk'] = read_count
        item['rev'] = comment
        item['sub'] = title
        item['aut'] = aut
        item['last'] = last
        item['url'] = url
        infos.append(item)
    return infos


# Request the first ten list pages and parse each one
def qingqiu():
    for i in range(1, 11):
        # Build the URL of page i
        base_url = f'http://guba.eastmoney.com/default,99_{i}.html'
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.79 Safari/537.36',
        }
        response = requests.get(base_url, headers=headers)
        infos = parse_page(response.text)
    return infos


if __name__ == '__main__':
    # infos = qingqiu()       # crawl the ten pages and collect the data into infos
    # write_to_json(infos)    # write the data to movies.json
    # movies.json is created by write_to_json on the first crawl
    with open('movies.json', 'r', encoding='utf-8') as fp:
        infos = json.load(fp)
    for item in infos:
        print(item)
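Because the extraction is plain regex over a string, the parser can be sanity-checked without sending a single request. The snippet below feeds `parse_page` a small hand-written fragment; the markup is an assumption modelled on the reconstructed patterns above, not a capture of the real page, so it only confirms that the regexes and the item dictionary fit together.

# Offline sanity check for parse_page (run after the definitions above).
# The HTML here is invented to match the assumed patterns, not real Guba markup.
sample_html = '''
<ul class="newlist">
  <li>
    <cite>3456</cite>
    <cite>12</cite>
    <a href="/news,600519,12345.html" class="note">sample post title</a>
    <a target="_blank">some_author</a>
    <cite class="last">02-14 09:30</cite>
  </li>
</ul>
'''
print(parse_page(sample_html))
# -> [{'clk': '3456', 'rev': '12', 'sub': 'sample post title',
#      'aut': 'some_author', 'last': '02-14 09:30', 'url': '/news,600519,12345.html'}]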

    ==========================================================================
    Optimized version of the crawler for the first ten Guba pages

    import requests, re, json


    # Guard against a failed match: return the requested group, or '' if the
    # pattern did not match (re.search returned None)
    def get_match(match, number):
        if match:
            return match.group(number)
        return ''


    def write_to_json(infos):
        with open('guba.json', 'w', encoding='utf-8') as fp:
            json.dump(infos, fp)
    
    def parse_page(html_str):
        # print(html_str)
        ul_p = re.compile(r'<ul class="newlist".*?>(.*?)</ul>', re.S)
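        # Sketch of how the extraction might continue: the tag patterns reuse the
        # reconstructed <li>/<cite>/class="note" markup assumed in the first
        # version, so treat them as placeholders. Every search() goes through
        # get_match, so a post with unexpected markup yields '' instead of
        # raising AttributeError and aborting the crawl.
        ul_content = get_match(ul_p.search(html_str), 0)
        li_p = re.compile(r'<li.*?>(.*?)</li>', re.S)
        cite_q = re.compile(r'<cite>(.*?)</cite>', re.S)
        title_p = re.compile(r'class="note">(.*?)</a>', re.S)
        aut_p = re.compile(r'target="_blank">(.*?)</a>', re.S)
        last_p = re.compile(r'class="last">(.*?)</cite>', re.S)
        url_p = re.compile(r'<a href="(.*?)"', re.S)
        infos = []
        for li in li_p.findall(ul_content):
            cites = cite_q.findall(li)
            item = {
                'clk': cites[0].strip() if len(cites) > 0 else '',
                'rev': cites[1].strip() if len(cites) > 1 else '',
                'sub': get_match(title_p.search(li), 1).strip(),
                'aut': get_match(aut_p.search(li), 1).strip(),
                'last': get_match(last_p.search(li), 1).strip(),
                'url': get_match(url_p.search(li), 1),
            }
            infos.append(item)
        return infos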
