Python crawler: a generic template for scraping URLs and storing them in MongoDB

Crawler code, spider.py:

import requests
from lxml import etree

from save_mongodb import MongoClient


class Spider(object):
    def __init__(self):
        self.url = 'http://www.xxx.com/'
        self.mongo = MongoClient()
        self.name = 'xxx'
        self.headers = {}  # add a User-Agent here if the site rejects default requests

    def spider_url(self):  # scrape the list of URLs from the page
        response = requests.get(url=self.url, headers=self.headers)
        html = etree.HTML(response.text)
        url_list = html.xpath('//a/@href')
        # iterate backwards so pop(i) does not shift the unvisited indices
        for i in range(len(url_list) - 1, -1, -1):
            url = url_list[i]
            # drop javascript pseudo-links, empty hrefs, mailto-style
            # addresses and direct image links
            if ('javascript' in url or url == '' or '@' in url
                    or '.jpg' in url or '.png' in url):
                url_list.pop(i)
            elif 'http' not in url:
                # repair protocol-relative links such as '//host/path'
                url_list[i] = 'http:' + url
        return url_list

    def save_url(self):  # save the scraped URLs to MongoDB
        url_list = self.spider_url()
        for url in url_list:
            item = {'url': url, 'source': self.name}
            self.mongo.add_document(item, self.name)
        print(self.name + ': done, saved ' + str(len(url_list)) + ' urls')


if __name__ == '__main__':
    sp = Spider()
    sp.save_url()
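
One caveat on the URL cleanup above: the 'http:' + url fix only repairs protocol-relative links such as //example.com/page; a site-relative path like /about would become the invalid http:/about. If the target site uses relative paths, urllib.parse.urljoin handles both cases. A minimal sketch, using self.url as the base (the URLs below are illustrative):

from urllib.parse import urljoin

base = 'http://www.xxx.com/'
print(urljoin(base, '//cdn.example.com/a.js'))  # -> http://cdn.example.com/a.js
print(urljoin(base, '/about'))                  # -> http://www.xxx.com/about
print(urljoin(base, 'http://other.com/x'))      # absolute URLs pass through unchanged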

Database code, save_mongodb.py:

import datetime

import pymongo

from config import MONGO_DB_HOST, MONGO_DB, password, username


class MongoClient(object):

    def __init__(self, mongo_uri=MONGO_DB_HOST, port=27017, mongo_db=MONGO_DB):
        # For an authenticated deployment, build a full connection URI instead:
        # self.mongo_uri = 'mongodb://{}:{}@{}/admin?connectTimeoutMS=300&ssl=false&minPoolSize=2&maxPoolSize=10&readConcernLevel=majority&readPreference=secondary&authMechanism=SCRAM-SHA-1&localThresholdMS=30'.format(
        #     username, password, mongo_uri)
        self.mongo_uri = mongo_uri
        self.port = port
        self.mongo_db = mongo_db
        # connect to host:port; decode errors in stored strings are ignored
        self.client = pymongo.MongoClient(self.mongo_uri, self.port, connect=True,
                                          unicode_decode_error_handler='ignore')
        self.db = self.client[self.mongo_db]

    def add_document(self, item, date):
        # insert a document into the collection named `date`
        # (insert() was removed in pymongo 4.x; insert_one is the replacement)
        self.db[date].insert_one(item)

    def find_document(self, id, date):
        # look up a single document by its 'id' field in the collection named `date`
        collist = self.db.list_collection_names()
        if date in collist:
            key = self.db[date].find_one({'id': id})
            return key
        else:
            return None

    def find_same_url(self, url, source):
        # return True if the url/source pair is NOT stored in any collection
        # yet (i.e. safe to insert), False if a duplicate already exists
        collist = self.db.list_collection_names()
        for col in collist:
            key = self.db[col].find_one({'url': url, 'source': source})
            if key is not None:
                return False
        return True

    def find_max_id(self, date):
        # return the largest 'id' in the collection named `date`, or None
        collist = self.db.list_collection_names()
        if date in collist:
            max_data = self.db[date].find().sort([('id', -1)]).next()
            max_id = max_data['id']
            return max_id
        else:
            return None

    def close_client(self):
        self.client.close()


if __name__ == '__main__':
    def getToday():
        """
        Get today's date.
        :return: today's date as a datetime.date
        """
        today = datetime.date.today()
        return today


    mc = MongoClient()
    print(mc.find_max_id(getToday().strftime('%Y-%m-%d')))
    mc.close_client()
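
Note that spider.py above never calls find_same_url, so re-running the crawler stores duplicate URLs. A dedupe-aware variant of Spider.save_url is a small change; this is a sketch that reuses the classes above:

    def save_url(self):  # save only URLs that are not stored yet
        url_list = self.spider_url()
        saved = 0
        for url in url_list:
            # find_same_url returns True when the url is new
            if self.mongo.find_same_url(url, self.name):
                self.mongo.add_document({'url': url, 'source': self.name}, self.name)
                saved += 1
        print(self.name + ': done, saved ' + str(saved) + ' new urls')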

Configuration file, config.py:

from urllib import parse

# percent-encode the credentials so they can be embedded in a connection URI
username = parse.quote_plus("xxx")
password = parse.quote_plus("xxx")

MONGO_DB_HOST = 'localhost'
MONGO_DB = 'benign_url'
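
quote_plus matters because characters like '@' or ':' in the credentials would otherwise break the connection URI. A sketch of how the commented-out line in MongoClient.__init__ assembles the authenticated URI (the credentials here are made up):

from urllib import parse

username = parse.quote_plus('user@example')  # '@' becomes %40
password = parse.quote_plus('p@ss:word')     # ':' becomes %3A
MONGO_DB_HOST = 'localhost'

mongo_uri = 'mongodb://{}:{}@{}/admin?authMechanism=SCRAM-SHA-1'.format(
    username, password, MONGO_DB_HOST)
# -> mongodb://user%40example:p%40ss%3Aword@localhost/admin?authMechanism=SCRAM-SHA-1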
