【Python爬虫】人民日报科技1

爬取新闻标题,新闻简述,标签并写进csv文件

#__author:'cuiwnehao'__
#coding:utf-8
import requests,csv
from lxml import etree

# Site root; prepended to the relative article hrefs scraped in parse_data().
root_url='http://scitech.people.com.cn'
def get_urls(pages=13):
    """Build the list of listing-page URLs to crawl.

    Args:
        pages: number of index pages to generate (default 13, matching the
            original hard-coded ``range(0, 13)``).

    Returns:
        list[str]: URLs ``index0.html`` .. ``index{pages-1}.html``.
    """
    base_url = 'http://scitech.people.com.cn/index{}.html#fy01'
    # NOTE(review): numbering starts at 0 here; confirm the site really
    # serves index0.html (some people.com.cn sections start at index.html).
    return [base_url.format(page) for page in range(pages)]

def parse_data(url):
    """Fetch one listing page and append parsed news items to the
    module-level ``datas`` list.

    Each appended item is a dict with keys: ``url`` (listing page),
    ``title_url`` (absolute article URL), ``title``, ``short_content``
    (summary text), ``tags`` ('-'-joined tag names).

    Relies on a global ``datas`` list being defined by the caller
    (see the ``__main__`` block).

    Args:
        url: a listing-page URL as produced by get_urls().

    Returns:
        The shared ``datas`` list (also mutated in place).
    """
    # Timeout keeps one dead page from hanging the whole crawl.
    req = requests.get(url, timeout=10)
    if req.status_code == 200:
        # Pages are served GB2312-encoded; set before reading req.text.
        # NOTE(review): req.apparent_encoding would be safer if pages vary.
        req.encoding = "GB2312"
        selector = etree.HTML(req.text)
        infos = selector.xpath('//div[@class="hdNews clearfix"]')
        for info in infos:
            short_contents = info.xpath('div[@class="on"]/em/a/text()')
            # Skip entries without a summary (e.g. photo-only items),
            # mirroring the original behavior.
            if not short_contents:
                continue
            titles = info.xpath('div[@class="on"]/h5/a/text()')
            hrefs = info.xpath('div[@class="on"]/h5/a/@href')
            # Guard against layout variants missing the headline link;
            # the original indexed [0] unconditionally and could raise.
            if not titles or not hrefs:
                continue
            item = {
                'url': url,
                'title_url': root_url + hrefs[0],
                'title': titles[0],
                'short_content': short_contents[0],
                'tags': '-'.join(info.xpath('h6/em[@class="gray"]/a/text()')),
            }
            datas.append(item)
    return datas

if __name__ == "__main__":
    # `global` at module level is a no-op; a plain assignment creates the
    # module-level list that parse_data() appends to.
    datas = []
    for url in get_urls():
        parse_data(url)

    # Only write the CSV when something was scraped: the original indexed
    # datas[0] unconditionally and crashed on an empty crawl.
    if datas:
        with open('datas1.csv', 'w', encoding='utf-8', newline='') as f:
            # DictWriter keeps the header and row ordering in sync by key.
            writer = csv.DictWriter(f, fieldnames=list(datas[0].keys()))
            writer.writeheader()
            writer.writerows(datas)

你可能感兴趣的:(【Python爬虫】人民日报科技1)