python 网站CDN

#coding:utf-8
import os
import sys
import tld
import time
import chardet

import get_header
import random
import socket
import requests
import builtwith
import dns.resolver
import urllib2
import pymongo
import urlparse
import mongo
from BeautifulSoup import BeautifulSoup  
#from Config import FileConfig


# Date stamp (YYYY-MM-DD, local time) computed once at import and stored with
# every record; strftime() uses the current local time when no tuple is given.
add_time = time.strftime('%Y-%m-%d')

class Url_Check(object):
    def __init__(self, url):
        super(Url_Check, self).__init__()
        self.cdninfo()
        self.url = url
        self.cnames = []
        self.headers = []

    def get_cnames(self): # get all cname
        """Resolve the CNAME chain of the URL's host into self.cnames.

        The first CNAME answer for the host is stored, then get_cname() is
        asked to follow the chain from there. When the URL has no host part
        or the first lookup fails, self.cnames is set to None (check() treats
        that as "no CNAME information").
        """
        # .hostname drops any ":port" / "user@" part that .netloc would keep
        # and that would make the DNS query fail.
        host = urlparse.urlparse(self.url).hostname
        if not host:
            self.cnames = None
            return

        try:
            answer = dns.resolver.query(host, 'CNAME')
        except Exception:
            # Best effort: any resolver failure just means "no CNAME known".
            self.cnames = None
            return

        first = [record.to_text() for record in answer][0]
        # Start from a fresh list so a repeated call neither duplicates
        # entries nor trips over a None left behind by an earlier failure.
        self.cnames = [first]
        self.get_cname(first)

    def get_cname(self,cname): # get cname
        """Follow the CNAME chain that starts at ``cname``.

        Each further target found is appended to self.cnames. The walk stops
        when a lookup yields no CNAME, when the resolver reports any DNS
        error (NoAnswer, NXDOMAIN, timeout, ...), or when a name repeats.

        :param cname: name whose CNAME record should be looked up first.
        """
        visited = set([cname])
        current = cname
        while True:
            try:
                answer = dns.resolver.query(current, 'CNAME')
            except dns.exception.DNSException:
                # End of the chain (or resolver trouble): keep what we have.
                break
            targets = [record.to_text() for record in answer]
            if not targets:
                break
            current = targets[0]
            if current in visited:
                # CNAME loop - stop instead of walking it forever.
                break
            visited.add(current)
            self.cnames.append(current)
    #----------------------------------------------------------------------
    def conn_url(self):
        """Open self.url and return the response object.

        The request carries the headers produced by get_header.get_header().
        On any error the URL and the error text are printed and None is
        returned.
        """
        try:
            request = urllib2.Request(self.url, headers=get_header.get_header())
            return urllib2.urlopen(request)
        except Exception as err:
            print('[-] self.url:' + self.url)
            print(str(err))
    
    def get_headers(self): # get header
        try:
            resp = self.conn_url()
        except Exception as e:
            self.headers = None
            # print "ERROR: %s" % e
        else:
            headers = str(resp.headers).lower()
            self.headers = headers
            
            
    #----------------------------------------------------------------------
    def get_ip(self):
        """"""
        try:
            domain_url = str(self.url.strip())[7:]
            ip_url = socket.getaddrinfo(domain_url,'http')[0][4][0]
            #ip_url = socket.gethostbyname(url)
            return ip_url
        except Exception,e:
            pass
        
        
    #----------------------------------------------------------------------
    def get_title(self):
        """Download self.url and return the text of its <title> element.

        The body's character set is guessed with chardet; pages detected as
        GB2312 are decoded as GB18030. Returns None when the page has no
        <title>. On any error the error text is printed and None is
        returned.
        """
        try:
            resp = urllib2.urlopen(self.url)
            try:
                html = resp.read()
            finally:
                # Always release the connection, even if reading fails.
                resp.close()

            # chardet reports None when it cannot decide; pass that through
            # as "no hint" instead of the literal string 'None'.
            encoding = chardet.detect(html)['encoding']
            if encoding == 'GB2312':
                encoding = "GB18030"

            soup = BeautifulSoup(html, fromEncoding=encoding)
            title = soup.title
            if title is None:
                return None
            return title.string
        except Exception as e:
            print(str(e))
        
    #----------------------------------------------------------------------
    def get_cms_url(self):
        """Return whatever builtwith.parse() reports for self.url.

        Any error raised by the lookup is swallowed and None is returned.
        """
        try:
            return builtwith.parse(self.url)
        except Exception:
            return None
            
    #----------------------------------------------------------------------
    def matched(self, context, *args): # Matching string
        """Return the first pattern that occurs in ``context``, else False.

        :param context: text to search; non-string values are searched in
            their str() form.
        :param args: candidate substrings, tried in the order given.
        """
        haystack = context if isinstance(context, basestring) else str(context)
        return next((needle for needle in args if needle in haystack), False)

    def check(self):
        try:
            flag = None
            self.get_cnames()
            self.get_headers()            
            if self.cnames:
                # print self.cnames
                flag = self.matched(self.cnames,*self.cdn['cname'])
                if flag:
                    print '[+]  ' + self.url + flag
                    return {'Status':True, 'CDN':self.cdn['cname'].get(flag)}
            if not flag and self.headers:
                flag = self.matched(self.headers,*self.cdn['headers'])
                if flag:             
                    return {'Status':True, 'CDN':'unknown'}
            return {'Status':False, 'CNAME':self.cnames, 'Headers':self.headers}
        except Exception,e:
            pass

    def cdninfo(self):
        self.cdn = {
            'headers': set([
 
    #----------------------------------------------------------------------
    def update_mongo(self):
        """Collect every probe result for self.url and upsert it via mongo.

        Runs get_cms_url(), get_title(), get_ip() and check() in that order
        and writes the results, together with the module-level add_time
        stamp, to mongo.ls_Info keyed by URL.
        """
        info = self.get_cms_url()
        title = self.get_title()
        address = self.get_ip()
        cdn_result = self.check()

        selector = {"URL": self.url}
        changes = {"$set": {'add_time': add_time, 'title': title,
                            'IP': address, 'Info': info, 'CDN': cdn_result}}
        # upsert: create the record when this URL has not been stored yet.
        mongo.ls_Info.update(selector, changes, upsert=True)
        print(self.url + '     end')


if __name__ == '__main__':
    #url = 'http://www.163.com'

    # Scan every URL listed in test.txt (one per line) and print the verdict.
    with open('test.txt') as f:
        for line in f:
            # strip() removes "\r\n" as well as "\n" endings; the previous
            # strip('\r').strip('\n') left a trailing "\r" on CRLF files.
            url = line.strip()
            if not url:
                # Blank lines are not URLs.
                continue
            print(url)
            cdn = Url_Check(url)
            print(cdn.check())

你可能感兴趣的:(python 网站CDN)