Notes on a Scraping Project: Hangzhou 19lou (杭州19楼)

In this project I scraped the 我要爆料 (tip-off) board on 19lou. The full code is below:

import requests
import re
import time
import random
import threading   # only used by the commented-out multi-threaded variant below
import pymysql     # imported but not used in this version of the script
from lxml import etree
from bs4 import BeautifulSoup

def lou_spider():
    url_str = 'https://www.19lou.com/forum-269-{}.html'
    def child_spider(i):
        """Crawl listing page i, then fetch each post on it and print its details."""
        url_ = url_str.format(i)
        # time.sleep(random.uniform(2, 3))
        result = requests.get(url_)
        html = etree.HTML(result.text)
        divs = html.xpath("//div[@class='list-data  ']//div[@class='list-data-item second-data']")
        for child in divs:
            try:
                title = child.xpath(".//div[@class='title']//a//span/text()")[0]
                url1 = child.xpath(".//div[@class='title']//a/@href")[0]
                author = child.xpath(".//div[@class='author']/a//text()")[0]
                time_ = child.xpath(".//div[@class='author']/span/text()")[0]
                read_num = child.xpath(".//div[@class='num numeral']//span[@class='num-read']/text()")[0]
                reply_num = child.xpath(".//div[@class='num numeral']//span[last()]/text()")[0]
                url = 'https:' + url1  # prepend the scheme to the protocol-relative href
                header = {
                    'Cookie': '_DM_SID_=738487cdc9f8053671d57e15661d8c0b; _Z3nY0d4C_=37XgPK9h; M_SMILEY_TIP_HIDE=1; JSESSIONID=C5D583BF5D7C3101392F866D29E757EC; _DM_S_=d5f197ec119fc7cd98d17e27bb471f46; f19big=ip49; PHPSESSID=b4a291697b991890adcd6971cc80d43f; f9bigsec=u105; fr_adv=; Hm_lvt_2dda593f15c4767a276450bb2c252b5b=1551775709; Hm_lpvt_2dda593f15c4767a276450bb2c252b5b=1551775709; f9big=u42; reg_source=baidu.com; reg_kw=; reg_first=https%253A//www.19lou.com/; Hm_lvt_5185a335802fb72073721d2bb161cd94=1551418561,1551750677,1551943044; screen=1423; pm_count=%7B%7D; fr_adv_last=merry_thread_pc; reg_step=18; _dm_userinfo=%7B%22uid%22%3A0%2C%22stage%22%3A%22%22%2C%22city%22%3A%22%E6%B5%99%E6%B1%9F%3A%E6%9D%AD%E5%B7%9E%22%2C%22ip%22%3A%2260.191.114.2%22%2C%22sex%22%3A%222%22%2C%22frontdomain%22%3A%22www.19lou.com%22%2C%22category%22%3A%22%22%7D; dayCount=%5B%5D; _dm_tagnames=%5B%7B%22k%22%3A%22%E6%88%91%E5%90%AC%E8%AF%B4%E7%9A%84%22%2C%22c%22%3A1%7D%2C%7B%22k%22%3A%22%E6%9D%AD%E5%B7%9E%E6%9C%80%E6%96%B0%E6%96%B0%E9%97%BB%22%2C%22c%22%3A29%7D%2C%7B%22k%22%3A%22%E8%8D%89%E6%A0%B9%E7%A4%BE%E5%8C%BA%22%2C%22c%22%3A29%7D%2C%7B%22k%22%3A%22%E6%88%91%E8%A6%81%E7%88%86%E6%96%99%22%2C%22c%22%3A33%7D%2C%7B%22k%22%3A%22%E6%B8%A3%E7%94%B7%E5%87%BA%E8%BD%A8%22%2C%22c%22%3A1%7D%2C%7B%22k%22%3A%22%E5%BE%AE%E7%88%B1%E5%BF%83%22%2C%22c%22%3A2%7D%2C%7B%22k%22%3A%22%E6%9D%AD%E5%B7%9E%E5%AE%9E%E7%94%A8%E4%BF%A1%E6%81%AF%22%2C%22c%22%3A1%7D%2C%7B%22k%22%3A%22%E7%94%9F%E6%B4%BB%E5%AE%9E%E7%94%A8%E4%BF%A1%E6%81%AF%22%2C%22c%22%3A1%7D%2C%7B%22k%22%3A%22%E5%8F%A3%E6%B0%B4%E4%B9%90%E5%9B%AD%22%2C%22c%22%3A1%7D%2C%7B%22k%22%3A%22%E6%9D%AD%E5%B7%9E%E6%B2%BB%E5%AE%89%22%2C%22c%22%3A1%7D%2C%7B%22k%22%3A%22%E6%9D%AD%E5%B7%9E%E4%BA%A4%E9%80%9A%22%2C%22c%22%3A1%7D%2C%7B%22k%22%3A%22%E6%9D%AD%E5%B7%9E%E4%BA%A4%E9%80%9A%E6%B2%BB%E5%AE%89%22%2C%22c%22%3A1%7D%2C%7B%22k%22%3A%22%E5%9F%8E%E5%B8%82%E5%A4%A7%E7%AE%A1%E5%AE%B6%22%2C%22c%22%3A3%7D%2C%7B%22k%22%3A%22%E7%A4%BE%E4%BC%9A%22%2C%22c%22%3A1%7D%2C%7B%22k%22%3A%22%E5%9E%83%E5%9C%BE%22%2C%22c%22%3A9%7D%2C%7B%22k%22%3A%22%E8%BD%AC%E5%9F%BA%E5%9B%A0%22%2C%22c%22%3A1%7D%2C%7B%22k%22%3A%22%E5%BA%9F%E7%89%A9%22%2C%22c%22%3A1%7D%2C%7B%22k%22%3A%22%E6%96%B0%E4%BA%BA%E8%81%8A%E5%A4%A9%E5%AE%A4%22%2C%22c%22%3A2%7D%2C%7B%22k%22%3A%22%E5%8F%B0%E5%B7%9E%E6%B6%88%E6%81%AF%22%2C%22c%22%3A3%7D%2C%7B%22k%22%3A%22%E5%8F%B0%E5%B7%9E%E6%96%B0%E9%97%BB%E7%BD%91%22%2C%22c%22%3A3%7D%5D; Hm_lpvt_5185a335802fb72073721d2bb161cd94=1552534759',
                    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36',
                }
                time.sleep(random.uniform(1.5,2.0))
                r = requests.get(url, headers=header)
                soup = BeautifulSoup(r.text, 'lxml')
                results = soup.find('div', {'class': 'thread-cont'}).get_text().strip()
                # drop markup leftovers, newlines, ordinary spaces and non-breaking spaces
                for ch in ('>', '/', '"', '\n', ' ', '\xa0'):
                    results = results.replace(ch, '')
                text = re.sub('[A-Za-z]+', '', results).rstrip('0123456789:%.=-_;()&,')
                print('title==>', title)
                print('url==>', url)
                print('author==>', author)
                print('time==>', time_)
                print('read==>', read_num)
                print('reply==>', reply_num)
                print('text', text)
            except Exception:
                # skip any item whose page or fields cannot be parsed
                continue

    for i in range(1, 5):  # crawl listing pages 1-4
        # threaded version (see the sketch at the end of this post):
        # thread_func = threading.Thread(target=child_spider, args=(i,))
        # thread_func.start()
        child_spider(i)


lou_spider()

I ran into a few problems along the way. The first: at the start, requests could not get any data from the content pages, so I switched to selenium. That solved the problem, but this project has to be deployed on a CentOS 7.5 server, which means installing a selenium + chromedriver environment there. I spent two whole days on it, hit one pitfall after another, and never got it installed, so I dropped selenium and went back to requests; adding the request headers made it work (I had added headers before without success, probably because I made a mistake at the time).

The second problem is the IP getting banned. Since crawling too fast gets the IP blocked, I added a sleep, but it still seems to get banned; I am still experimenting, and if that does not work I will add a proxy (see the sketch below). I also ran the pages in multiple threads (the commented-out threading.Thread calls above) to speed up the crawl; a sketch of the threaded loop is at the end of this post.
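
If sleeping between requests is not enough to keep the IP from being banned, requests can send traffic through a proxy via its proxies argument. Below is a minimal sketch; PROXY_POOL and its addresses are placeholders I made up and would have to be replaced with a real proxy pool:

import random
import requests

# Placeholder proxy addresses - swap in entries from a real proxy pool.
PROXY_POOL = [
    'http://127.0.0.1:8001',
    'http://127.0.0.1:8002',
]

def get_with_proxy(url, headers=None):
    # choose a proxy at random and use it for both http and https traffic
    proxy = random.choice(PROXY_POOL)
    return requests.get(url, headers=headers,
                        proxies={'http': proxy, 'https': proxy},
                        timeout=10)

child_spider would then call get_with_proxy(url, headers=header) instead of requests.get, so each post request goes out through a randomly chosen address.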

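The threaded version corresponds to the commented-out lines inside lou_spider. A sketch of the page loop with threading enabled, assuming child_spider as defined above, and joining the threads so lou_spider only returns once every page is done:

    threads = []
    for i in range(1, 5):
        t = threading.Thread(target=child_spider, args=(i,))
        t.start()
        threads.append(t)
    for t in threads:
        t.join()  # wait for all four listing pages to finish

Note that four threads roughly quadruple the request rate, so the per-request sleep may need to be lengthened if the site still bans the IP.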