Crawling the Zhejiang Industry & Trade (zjitc.net) news pages

The script below builds the URLs of the campus-news list pages, collects every article link from each list page, and parses out each article's title (biaoti), author (zuozhe), source (laiyuan), publish time (shijian), and body text (zhengwen).

import requests
import chardet
from lxml import etree


def jiexi(rep):
    """Parse one article page and return its fields as a dict."""
    et = etree.HTML(rep.text)
    biaoti = et.xpath("//h2/text()")[0]  # article title
    # The first div.zz holds "作者:... 来源:..." separated by whitespace.
    # str.removeprefix (Python 3.9+) strips the literal label.
    zz = et.xpath("//div[@class='zz'][1]/text()")[0].split()
    zuozhe = zz[0].removeprefix("作者:")    # author
    laiyuan = zz[1].removeprefix("来源:")   # source
    # The second div.zz holds the publish time after a "\xa0\xa0" separator.
    shijian = et.xpath("//div[@class='zz'][2]/text()")[0].split("\xa0\xa0")[1].removeprefix("发布时间:")
    # Concatenate every text node of the article body and drop all whitespace.
    parts = et.xpath("//div[@class='nr-content-con fl']/div[1]//text()")
    zhengwen = "".join("".join(parts).split())
    return {
        "biaoti": biaoti,
        "zuozhe": zuozhe,
        "laiyuan": laiyuan,
        "shijian": shijian,
        "zhengwen": zhengwen,
    }
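
Before crawling every list page, it can help to sanity-check jiexi() on a single article page. The snippet below is only a minimal sketch: it assumes the imports above, and the URL is a placeholder to be replaced with any real article link taken from a list page.

# Placeholder: replace with a real article URL from one of the list pages.
test_url = "REPLACE_WITH_AN_ARTICLE_URL"
rep = requests.get(test_url, headers={"User-Agent": "Mozilla/5.0"})
rep.encoding = chardet.detect(rep.content)["encoding"]
print(jiexi(rep))  # prints the biaoti/zuozhe/laiyuan/shijian/zhengwen dict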

# Build the list-page URLs: the newest page is xyxw.htm, older pages are
# numbered downwards (358.htm, 357.htm, ...) under /xwzx/xyxw/.
url_list = []
for i in range(0, 10):
    if i == 0:
        url_list.append("http://www.zjitc.net/xwzx/xyxw.htm")
    else:
        url_list.append("http://www.zjitc.net/xwzx/xyxw/" + str(359 - i) + ".htm")

headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36"}
ul_head = "http://www.zjitc.net/"

for url in url_list:
    response = requests.get(url, headers=headers)
    response.encoding = chardet.detect(response.content)["encoding"]
    et = etree.HTML(response.text)
    # Collect the absolute URL of every article linked from this list page.
    ul_list = []
    for li in et.xpath("//div[@class='right-1']/ul/li"):
        href = li.xpath("./a/@href")[0]
        # Relative links start with "../../" or "../"; rebuild them as absolute URLs.
        if href.startswith("../../"):
            ul_list.append(ul_head + href.removeprefix("../../"))
        else:
            ul_list.append(ul_head + href.removeprefix("../"))
    # Download and parse every article on this list page.
    wenzhang = []
    for ul in ul_list:
        rep = requests.get(ul, headers=headers)
        rep.encoding = chardet.detect(rep.content)["encoding"]
        wenzhang.append(jiexi(rep))
    print(wenzhang)
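
The loop above only prints each page's results. To keep them, one option is to dump the article dictionaries to a UTF-8 JSON file. The sketch below assumes the per-page wenzhang lists are gathered into a single all_articles list inside the crawl loop; the output file name is just an example.

import json

all_articles = []  # fill during the crawl, e.g. all_articles.extend(wenzhang) after each page

with open("zjitc_news.json", "w", encoding="utf-8") as f:
    json.dump(all_articles, f, ensure_ascii=False, indent=2)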
