# orm爬虫 (ORM-based crawler)

#!/usr/bin/env python2.7
# -*- coding: utf-8 -*-

import re
import requests

from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy import create_engine, orm
from sqlalchemy import Column, String, TEXT

# Database connection settings. Every key except TABLENAME is substituted
# into the engine URL built in Spyder.__init__; TABLENAME names the table
# the VulnInfo model maps to.
# NOTE(review): host and credentials are hard-coded here -- consider moving
# them out of source control.
DB_CONFIG = dict(
    DRIVER='mysql',
    HOST='192.168.14.8',
    USER='root',
    PASSWD='123',
    DBNAME='cnnvd',
    CHARSET='utf8',
    TABLENAME='nsfocus',
    PORT=3306,
)

class VulnInfo(declarative_base()):
    """ORM model for one crawled vulnerability page.

    The table name comes from ``DB_CONFIG['TABLENAME']``. Apart from ``url``,
    the attribute names are exactly the values of ``Spyder.nsfocus2item``,
    so a scraped item dict can be passed straight in as keyword arguments.
    Every field is stored as a string or text column; nothing is parsed
    into numeric or date types.
    """
    __tablename__ = DB_CONFIG['TABLENAME']
    # URL of the page the record was scraped from; primary key of the table.
    url = Column(String(255), primary_key=True)

    # Free-text fields: name, description and remediation advice.
    vuln_name = Column(String(255))
    vuln_desc = Column(TEXT)
    solution = Column(TEXT)

    # Risk score, dangerous-plugin flag and discovery date, all kept as the
    # raw strings captured from the page.
    score = Column(String(15))
    is_dangerous_plugin = Column(String(15))
    found_date = Column(String(31))

    # Identifiers of the same vulnerability in various databases, plus the
    # CVSS score, again stored verbatim as strings.
    cnnvd = Column(String(127))
    cve = Column(String(127))
    cncve = Column(String(127))
    bugtraq = Column(String(15))
    nsfocus = Column(String(15))
    cvss_score = Column(String(16))
    cnvd = Column(String(255))


class Spyder(object):
    """Crawl sequentially numbered vulnerability pages into the database.

    For every id in a range the crawler fetches ``baseURL + str(id)``,
    extracts (label, value) pairs from the page with ``pattern``, maps the
    labels to ``VulnInfo`` column names through ``nsfocus2item`` and stores
    the result through a SQLAlchemy session.

    NOTE(review): ``baseURL`` and the ``Cookie`` header are placeholders, and
    ``pattern`` looks like it lost its HTML tag delimiters when the code was
    published. All three have to be filled in for the target site before the
    crawler can extract anything useful.
    """

    # Prefix that the numeric page id is appended to.
    baseURL = "url"
    delete_space_pattern = re.compile(r'\s')
    # Expected to capture (label, value) as groups 1 and 2 of every match.
    pattern = re.compile(r'(.*?).*?(.*?)', re.I | re.M | re.DOTALL)
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Cookie': '自己补充'
    }
    # Page label -> VulnInfo attribute name.
    nsfocus2item = {
        u'漏洞名称': 'vuln_name',
        u'漏洞描述': 'vuln_desc',
        u'解决方法': 'solution',
        u'危险分值': 'score',
        u'危险插件': 'is_dangerous_plugin',
        u'发现日期': 'found_date',
        u'CVE编号': 'cve',
        u'CNNVD编号': 'cnnvd',
        u'CNCVE编号': 'cncve',
        u'BUGTRAQ': 'bugtraq',
        u'NSFOCUS': 'nsfocus',
        u'CVSS评分': 'cvss_score',
        u'CNVD编号': 'cnvd',
    }

    def __init__(self):
        """Open the database session and make sure the target table exists."""
        # Register pymysql under the MySQLdb name so the plain 'mysql://'
        # engine URL works without the C-based MySQLdb driver installed.
        import pymysql
        pymysql.install_as_MySQLdb()
        self.engine = create_engine(
            '{DRIVER}://{USER}:{PASSWD}@{HOST}:{PORT}'
            '/{DBNAME}?charset={CHARSET}'.format(**DB_CONFIG)
        )
        self.SessionClass = orm.sessionmaker(bind=self.engine)
        # The attribute name keeps its original spelling so that any
        # external code reading it continues to work.
        self.sesssion = self.SessionClass()
        # create_all() checks for existing tables itself (checkfirst defaults
        # to True), so no separate lookup via the deprecated
        # Engine.table_names() is needed.
        VulnInfo.metadata.create_all(self.engine)

    def insert(self, item):
        """Store ``item`` as a VulnInfo row unless its URL is already stored.

        ``item`` is a dict of VulnInfo column names to values and must
        contain ``'url'``. Database errors are reported and rolled back
        rather than raised, so one bad record does not stop the crawl.
        """
        # The duplicate check has to use the primary key column (url).
        already_stored = (
            self.sesssion.query(VulnInfo).filter_by(url=item['url']).count()
        )
        if already_stored:
            return
        try:
            self.sesssion.add(VulnInfo(**item))
            self.sesssion.commit()
        except Exception as exc:
            # Best effort: leave the session usable for the next record.
            self.sesssion.rollback()
            print('insert failed for %s: %r' % (item['url'], exc))

    def _fetch(self, url):
        """Return the decoded body of ``url``, or None if it is unusable."""
        try:
            # NOTE(review): verify=False disables TLS certificate checks.
            response = requests.get(
                url, headers=self.headers, verify=False, timeout=10)
        except requests.RequestException:
            # Only network-level failures are skipped; KeyboardInterrupt and
            # programming errors propagate.
            return None
        if not response.content:
            return None
        try:
            return response.content.decode('utf8')
        except UnicodeDecodeError:
            # A page that is not valid UTF-8 is skipped like a failed fetch.
            return None

    def _parse_fields(self, text):
        """Map the (label, value) pairs found in ``text`` to column names."""
        fields = {}
        for match in self.pattern.findall(text):
            column = self.nsfocus2item.get(match[0].strip())
            # Labels without a known column are ignored instead of raising
            # KeyError and aborting the whole crawl.
            if column is not None:
                fields[column] = match[1]
        return fields

    def start(self, offset=1200001, end=1200697):
        """Crawl the pages with ids in ``range(offset, end)`` and store them.

        Pages that cannot be fetched, are empty or cannot be decoded are
        skipped. Every other page is inserted, keyed by its URL.
        """
        for page_id in range(offset, end):
            url = self.baseURL + str(page_id)
            print(url)
            text = self._fetch(url)
            if not text:
                continue
            item = self._parse_fields(text)
            item['url'] = url
            self.insert(item)
            

if __name__ == '__main__':
    # Executed as a script: crawl the default page-id range.
    spyder = Spyder()
    spyder.start()

 

# 你可能感兴趣的:(orm爬虫)  -- "You may also be interested in: (ORM crawler)"; web-page footer, not code