Scraping a News Site's Titles and Links into Excel with Python

I recently scraped a news site for its article titles, article-page links, and publish times, using bs4 and re. For managing the URLs to be crawled, I chose to put them into a Queue and pull them from there; storing them in a set() or in a plain .txt file would work just as well (a small sketch of the set() alternative follows below).
My regex skills are not great, so the regex parts may look a bit forced.
I chose Excel for storage. The data could also be saved to MySQL; that code is not written yet and will be added later (a hedged sketch follows after the listing below). The code still has rough edges and is being revised.
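As a side note, here is a minimal sketch of the set()-based alternative mentioned above; the variable name url_pool and the seeding logic are illustrative assumptions, not part of the original code:

# coding:utf-8
# Sketch: managing pending crawl URLs with a set() instead of a Queue.
# "url_pool" is a hypothetical name; a set also deduplicates URLs automatically.
url_pool = set()
url_pool.add("http://hsb.hspress.net/")   # seed with the home page

while url_pool:
    url = url_pool.pop()   # take an arbitrary pending URL
    # ... fetch and parse `url` here, calling url_pool.add() on newly found links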
# coding:utf-8

import requests
import random
import re
from Queue import Queue
from bs4 import BeautifulSoup
from xlwt import Workbook
import sys
reload(sys)
sys.setdefaultencoding("utf-8")   # Python 2 hack: default to utf-8 to avoid UnicodeDecodeError

class spider_web_news():
    def __init__(self):
        # Pool of User-Agent strings; one is picked at random for the session
        user_agent = [
            'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.101 Safari/537.36',
            "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SV1; QQDownload 732; .NET4.0C; .NET4.0E; SE 2.X MetaSr 1.0)",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Maxthon/4.4.3.4000 Chrome/30.0.1599.101 Safari/537.36",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 UBrowser/4.0.3214.0 Safari/537.36"
        ]
        self.headers = {"User-Agent": random.choice(user_agent)}   # the header key must be "User-Agent"
        self.url = Queue()  # queue of topic-page URLs collected from the home page

    # Visit the home page and collect the links to each topic (channel) page
    def send_req(self, url):
        response = requests.get(url, headers=self.headers)
        response.encoding = "utf-8"
        soup = BeautifulSoup(response.text, "lxml")  # parse into a BeautifulSoup object
        # print soup.prettify()    # debug: dump the parsed page
        for base_tag in soup.find_all(class_="menuli"):
            base_url = re.findall('href="(.*?)"', str(base_tag))[0]
            # print base_url   # debug: print each topic-page link
            self.url.put(base_url)   # enqueue the topic-page link
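            # Note: the regex step above could also be done with BeautifulSoup
            # itself, e.g. base_tag.find("a")["href"] -- assuming each .menuli
            # element wraps exactly one <a> tag (not verified against the site).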

    # Visit each topic page and scrape the titles (front-page titles)
    def send_url(self):

        while not self.url.empty():
            base_url = self.url.get()
            response = requests.get(base_url, headers=self.headers)
            response.encoding = "utf-8"
            soup = BeautifulSoup(response.text, "lxml")
            for title in soup.find_all(class_="langmubt2"):
                # print "title is ---------",title
                # NOTE: assumed pattern -- the groups are (time, attrs, url,
                # title), so indices 0, 2 and 3 below pick time, url and title;
                # adjust the regex to the site's actual list-item markup.
                title = re.findall(
                    r'(\d{4}-\d{2}-\d{2}).*?<a([^>]*)href="(.*?)"[^>]*>(.*?)</a>',
                    str(title))
                for mubiao in title:
                    print mubiao[0], mubiao[2], mubiao[3]
                    yield mubiao[0], mubiao[2], mubiao[3]

    # Save the scraped data into an Excel file
    def save_title(self):
        print "111"   # leftover debug print
        num = 1
        title_file = Workbook(encoding="utf-8")   # create the workbook with utf-8 encoding
        table = title_file.add_sheet('data')      # add a sheet named "data"
        # write the header row
        table.write(0, 0, "time")
        table.write(0, 1, "url")
        table.write(0, 2, "title")
        all_data = self.send_url()   # consume the (time, url, title) tuples yielded by send_url()
        for data in all_data:
            table.write(num, 0, data[0])
            table.write(num, 1, data[1])
            table.write(num, 2, data[2])
            print "saving done..."
            num += 1
        title_file.save("002.xls")   # save the workbook


if __name__ == '__main__':
    url = "http://hsb.hspress.net/"
    A = spider_web_news()
    A.send_req(url)
    # A.send_url()
    A.save_title()
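The MySQL version mentioned above is not written yet; the following is only a hedged sketch of what it might look like, using the Python 2 MySQLdb driver. The database name, table, and credentials (news_db, news, root / your_password) are placeholder assumptions, not anything from the original code:

# coding:utf-8
# Sketch only, not the author's code. Assumes MySQLdb is installed and a
# table like this exists:
#   CREATE TABLE news (time VARCHAR(32), url VARCHAR(255), title VARCHAR(255));
import MySQLdb

def save_to_mysql(rows):
    # rows: an iterable of (time, url, title) tuples, e.g. from send_url()
    conn = MySQLdb.connect(host="localhost", user="root", passwd="your_password",
                           db="news_db", charset="utf8")
    cursor = conn.cursor()
    cursor.executemany("INSERT INTO news (time, url, title) VALUES (%s, %s, %s)",
                       list(rows))
    conn.commit()
    conn.close()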
