A small crawler exercise -- scraping the Tencent News homepage and sorting the headlines

import requests
import re

url = 'https://news.qq.com/'

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.9 Safari/537.36',
}

response = requests.get(url=url, headers=headers)
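# Optional hardening (not in the original snippet): fail fast on HTTP errors and
# let requests re-guess the charset so the Chinese headlines decode correctly.
response.raise_for_status()
response.encoding = response.apparent_encoding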

# The patterns below are incomplete: the HTML tags around the capture groups
# need to be filled in from the page source, otherwise the regexes match
# nothing useful.
root_pattern = '([\d\D]*?)'
two_pattern = '([\d\D]*?)'
three_pattern = '.html">(.*?)'
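# Purely illustrative (a guess at typical markup, not Tencent's actual HTML):
# a complete headline pattern has the same shape as three_pattern, e.g.
#   r'<a target="_blank" href="[^"]*?\.html"[^>]*>(.*?)</a>'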


root_html = response.text
first_html = re.findall(root_pattern, root_html)
first_html = ''.join(first_html)

two_html = re.findall(two_pattern, first_html)
two_html = ''.join(two_html)

three_html = re.findall(three_pattern, two_html)


# a = 1     # for debugging
# print(two_html)
# print(three_html)

# def my_news():
#     for x in range(len(three_html)):
#         print('%d:' % (x + 1) + three_html[x])

# Write the numbered headlines to a new file
with open('tengxun.txt', 'w', encoding='utf-8') as fb:
    for x in range(len(three_html)):
        fb.write('%d :' % (x + 1) + three_html[x] + '\n')
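
The title mentions sorting, but the loop above only numbers the headlines in the order they appear on the page. Below is a minimal sketch of an actual sort, assuming the titles collected in three_html are what should be ordered; the length-based criterion and the file name tengxun_sorted.txt are made up for the example.

# Sort the extracted titles by length, longest first, and write them out numbered.
sorted_titles = sorted(three_html, key=len, reverse=True)

with open('tengxun_sorted.txt', 'w', encoding='utf-8') as fb:
    for i, title in enumerate(sorted_titles, start=1):
        fb.write('%d : %s\n' % (i, title))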


 
