Scraping the Rolling News Feed
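
The script below walks TechWeb's rolling news list pages (http://www.techweb.com.cn/roll/list_{}.shtml, pages 1 through 104), reads each article's title, category, source, and publish time from the list, then fetches every article page to collect the body text plus character, paragraph, and image counts and the article's tags, and finally inserts one row per article into a MySQL table named roll_head.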

import requests
import pymysql
from lxml import etree

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36',
}

# Connect once up front instead of re-opening the database for every article.
db = pymysql.connect(host='127.0.0.1', user='root', password='123456',
                     database='key_word', charset='utf8')
cursor = db.cursor()
for num in range(1, 105):
    try:
        page_url = 'http://www.techweb.com.cn/roll/list_{}.shtml'.format(num)
        response = requests.get(page_url, headers=headers)
        response.encoding = 'utf-8'
        res_html = etree.HTML(response.text)

        # Each list row carries the title/link, category, source, and publish
        # time in four consecutive <span> elements.
        titles = res_html.xpath('//div[@class="newslist"]/ul/li/span[1]/a/text()')
        title_list = [t.strip() for t in titles]
        kinds = res_html.xpath('//div[@class="newslist"]/ul/li/span[2]/a/text()')
        kind_list = [k.strip() for k in kinds]
        url_list = res_html.xpath('//div[@class="newslist"]/ul/li/span[1]/a/@href')
        sources = res_html.xpath('//div[@class="newslist"]/ul/li/span[3]/text()')
        source_list = [s.strip() for s in sources]
        create_times = res_html.xpath('//div[@class="newslist"]/ul/li/span[4]/text()')
        times_list = [t.strip() for t in create_times]
        for i in range(len(url_list)):
            title = title_list[i]
            source = source_list[i]
            create_time = times_list[i]
            kind = kind_list[i]
            url_info = url_list[i]
            print(repr(title), repr(source), repr(create_time), repr(kind), repr(url_info))

            # Fetch the article page and join the body text out of div#content,
            # including text nested inside <strong> and <strong><span>.
            response_info = requests.get(url_info, headers=headers)
            response_info.encoding = 'utf-8'
            html_xpath_info = etree.HTML(response_info.text)
            article = html_xpath_info.xpath(
                '//div[@id="content"]/p/text()'
                '|//div[@id="content"]/p/strong/text()'
                '|//div[@id="content"]/p/strong/span/text()')
            real_article = ''.join(article)
            article_count = len(real_article)  # character count of the body
            paragraph = len(article)           # number of extracted text fragments
            img_count = len(html_xpath_info.xpath('//div[@id="content"]/p/img'))

            labels = html_xpath_info.xpath('//div[@class="tags"]/a/text()')
            if labels:
                label = ','.join(l.strip() for l in labels)
            else:
                label = '空'  # "empty": the article page has no tags
            # Parameterized insert: letting the driver quote the values avoids
            # the broken SQL (and injection risk) of %-formatting the statement.
            sql = ('insert into roll_head values '
                   '(null, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)')
            cursor.execute(sql, (title, source, create_time, url_info, real_article,
                                 article_count, img_count, label, kind, paragraph))
            db.commit()
    except Exception as e:
        # Report the failure instead of silently swallowing it.
        print('error:', e)

cursor.close()
db.close()
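
The insert above assumes a roll_head table whose first column is an auto-increment id followed by ten columns in the same order as the execute() tuple. The original post never shows the schema, so the snippet below is only a hypothetical sketch of compatible DDL (column names and types are guesses), run once with the same pymysql connection settings:

import pymysql

# Hypothetical schema: names and types are assumptions inferred from the
# values the crawler inserts, not taken from the original post.
create_sql = '''
create table if not exists roll_head (
    id            int primary key auto_increment,
    title         varchar(255),
    source        varchar(100),
    create_time   varchar(50),
    url           varchar(255),
    article       text,
    article_count int,
    img_count     int,
    label         varchar(255),
    kind          varchar(50),
    paragraph     int
) default charset=utf8;
'''

db = pymysql.connect(host='127.0.0.1', user='root', password='123456',
                     database='key_word', charset='utf8')
with db.cursor() as cursor:
    cursor.execute(create_sql)
db.commit()
db.close()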
