sqlmap百度爬虫

bugs:

中文支持不好,没做特殊处理,仅是个人业余爱好

--------------------------------------------------------------------------

相信用过sqlmap的人都有自己一套找bug website的方法。

我也只是刚刚听身边朋友说这个利器,看他天天百度找url找的挺累了,

因此写了一个python脚本给他批量抓url,

然后再将抓到的url给sqlmap检测

工具:python 2.7(32位)、lxml-3.1.1.win32-py2.7、pyquery-1.2.13、requests-2.10.0


#!/usr/bin/python
#coding=GBK

import re
import requests
from pyquery import PyQuery as Pq

class BaiduSearchSpider(object):
    
    def __init__(self, searchText):
        self.url = "http://www.baidu.com/baidu?wd=%s&tn=monline_4_dg" % searchText
        self.headers = {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_3) AppleWebKit/600.5.17 (KHTML, like Gecko) Version/8.0.5 Safari/600.5.17"}
        self._page = None

    def rePage(self,url):
        self.url = url
        self._page = None
    
    @property 
    def page(self):
        if not self._page:
            r = requests.get(self.url, headers=self.headers)
            r.encoding = 'utf-8'
            self._page = Pq(r.text)
        return self._page
    
    @property
    def baiduURLs(self):
        return [(site.attr('href'), site.text().encode('utf-8')) for site in self.page('div.result.c-container  h3.t  a').items()]
    @property
    def nextPageUrl(self):
        return [(site.attr('href'), site.text().encode('utf-8')) for site in self.page('div#page a.n').items()]
    
    @property    
    def originalURLs(self):
        tmpURLs = self.baiduURLs
        #print tmpURLs
        originalURLs = []
        for tmpurl in tmpURLs:
            tmpPage = requests.get(tmpurl[0], allow_redirects=False)
            if tmpPage.status_code == 200:
                urlMatch = re.search(r'URL=\'(.*?)\'', tmpPage.text.encode('utf-8'), re.S)
                #originalURLs.append((urlMatch.group(1), tmpurl[1]))
                originalURLs.append((urlMatch.group(1)))
            elif tmpPage.status_code == 302:
                #originalURLs.append((tmpPage.headers.get('location'), tmpurl[1]))
                originalURLs.append((tmpPage.headers.get('location')))
            else:
                print 'No URL found!!'
 
        return originalURLs

searchText = raw_input("搜索内容是:") 
print searchText

bdsearch = BaiduSearchSpider(searchText) 
count = 0
while (count < 100):
    originalurls = bdsearch.originalURLs
    #print originalurls
    for urlStr in originalurls:
        f = open('recode.txt','a')
        f.write(urlStr+'\n')
        f.close()
    pagesUrl = bdsearch.nextPageUrl
    nextUrl = ''
    if (len(pagesUrl) == 2) :     
        nextUrl = "http://www.baidu.com"+pagesUrl[1][0]
    elif(count==0):
        nextUrl = "http://www.baidu.com"+pagesUrl[0][0]
    else:
        print "search end"
        exit()       
    #print nextUrl
    bdsearch.rePage(nextUrl)
    count = count + 1
    print "count = "+str(count)

C:\Users\Administrator\Desktop\python\httptest\httptest>python getUrl.py
搜索内容是:inurl:asp?id=
inurl:asp?id=
count = 1
count = 2
count = 3
count = 4
count = 5
count = 6
count = 7
count = 8
count = 9
count = 10
count = 11
count = 12
count = 13
count = 14
count = 15
count = 16

在生成的recode.txt中保存着搜索结果

http://www.kemflo.net/news.php?id=45
http://www.lxjx.cn/news.php?id=259
http://www.hnccgc.com/jcxx/get_news.php?id=10069
http://www.southsurvey.com/public/news.php?id=1120
http://www.7daysinn.cn/news.php?id=2421
http://www.hzfc.gov.cn/zwgk/zwgknews.php?id=214690
http://www.hwqh.com.cn/viewnews.php?id=43923
http://www.neweekly.com.cn/newsview.php?id=2905
http://www.hnccgc.com/jcxx/get_news.php?id=11648
http://xwzx.cqupt.edu.cn/xwzx.?news.php?id=26533
http://www.hnccgc.com/xwzx/get_news.php?id=13015
http://www.sxsfgl.gov.cn/news.php?id=1281&root_lanmu=52
http://www.ccmt.org.cn/shownews.php?id=15031
http://www.hnccgc.com/jcxx/get_news.php?id=13353
http://www.bjmtgnews.com/paper/news.php?id=5909
http://www.ltzxw.com/news.php?id=3767
http://www.f0580.com/news/news.php?id=5284
http://www.cwhweb.com/news.php?id=3488
http://www.ks-lxjy.com/news/news.php?id=7066
http://www.chinawalking.net.cn/newsite/readnews.php?id=2298
http://www.oebrand.cn/news.php?id=12080
http://www.boosoochina.com/news/shownews.php?id=1496&lang=cn
http://www.badmintoncn.com/news.php?id=18300
http://www.dcfever.com/news/readnews.php?id=8150
http://www.stat-nba.com/news.php?id=6
http://tuan.zjcheshi.com/news.php?id=65991
http://www.fwol.cn/shownews.php?id=25087


你可能感兴趣的:(python,爬虫,脚本)