人工智能学习笔记:Python爬虫开发

一、爬虫介绍与常用工具

  1. 第一个爬虫

    from urllib.request import urlopen
    url = 'http://www.baidu.com'
    # Use the response as a context manager so the underlying connection is
    # always released, even if decoding or printing fails.
    with urlopen(url) as response:
        print(response.read().decode())   # response body, decoded with the default codec
        print(response.getcode())         # HTTP status code
        print(response.geturl())          # URL that was actually fetched
        print(response.info())            # HTTP response headers
    
  2. get请求

    from urllib.request import urlopen,Request
    from urllib.parse import quote
    from urllib.parse import urlencode
    # Query-string parameters; urlencode() percent-encodes every value and
    # joins the "key=value" pairs with '&'.
    args = {
        'wd':"尚学堂",
        'ie':'utf-8'
    }
    print(urlencode(args)) 		 # wd=%E5%B0%9A%E5%AD%A6%E5%A0%82&ie=utf-8
    # Alternative: percent-encode a single value with quote() and splice it in by hand:
    # url = 'https://www.baidu.com/s?ie=UTF-8&wd={}'.format(quote("尚学堂"))
    # urlencode() already produces complete "key=value" pairs, so its output goes
    # directly after '?'. Formatting it into "wd={}" would yield "wd=wd=...&ie=utf-8".
    url = 'https://www.baidu.com/s?{}'.format(urlencode(args))
    # Send a browser-like User-Agent instead of urllib's default one.
    headers = {
        "User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36"
    }
    req = Request(url,headers=headers)
    print(urlopen(req).read().decode())
    
  3. post请求:

    from urllib.request import urlopen,Request
    from urllib.parse import urlencode
    url = 'https://www.baidu.com/'
    # Form fields to submit in the request body.
    args = {
        'user':'111111',
        'password':'123456'
    }
    headers = {
        "User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36"
    }
    # Passing data= turns the request into a POST. urllib requires the body to
    # be bytes, so the urlencoded string has to be encoded first.
    req = Request(url,headers=headers,data=urlencode(args).encode('utf-8'))
    print(urlopen(req).read().decode())
    
  4. https请求问题

    import ssl
    # NOTE: _create_unverified_context is an underscore-prefixed (non-public) ssl helper.
    # With this context certificate checks are skipped, so only use it for hosts you trust.
    context = ssl._create_unverified_context() # skip SSL certificate verification
    # urlopen and url are expected to come from the earlier snippets.
    print(urlopen(url,context=context).read().decode())
    

二、爬虫常用开发模块

  1. 动态UserAgent的使用

    from fake_useragent import UserAgent
    ua = UserAgent()
    print(ua.ie)
    print(ua.chrome)
    print(ua.random)
    
  2. opener的使用

    from urllib.request import urlopen,Request
    url = "http://httpbin.org/get"
    from fake_useragent import UserAgent
    headers = {
        "User-Agent": UserAgent().random
    }
    req = Request(url,headers=headers)
    from urllib.request import  build_opener,HTTPHandler
    handler = HTTPHandler(debuglevel=1) 		#打印信息
    opener = build_opener(handler)
    resp = opener.open(req)
    #print(resp.read().decode())
    
  3. proxy代理的使用

    from urllib.request import  build_opener,ProxyHandler
    # handler = ProxyHandler({"http":"name:password@ip:port"})
    handler = ProxyHandler({"http":"211.137.52.158:8080"})
    opener = build_opener(handler)
    
  4. cookie使用

    from urllib.request import HTTPCookieProcessor
    handler = HTTPCookieProcessor()		#可以保存cookie
    opener = build_opener(handler)
    
  5. cookie的保存与使用

    # Saving cookies
    from http.cookiejar import MozillaCookieJar
    cookie_jar = MozillaCookieJar()
    handler = HTTPCookieProcessor(cookie_jar)
    opener = build_opener(handler)
    resp = opener.open(req)
    # save() writes the jar to a file; set_cookie() only adds one Cookie object
    # to the jar and does not accept a filename.
    cookie_jar.save('cookie.txt', ignore_discard=True, ignore_expires=True)
    # Loading cookies
    from http.cookiejar import MozillaCookieJar
    cookie_jar = MozillaCookieJar()
    cookie_jar.load('cookie.txt', ignore_discard=True, ignore_expires=True)
    handler = HTTPCookieProcessor(cookie_jar)
    opener = build_opener(handler)
    resp = opener.open(req)
    
  6. 捕获异常URLError

    from urllib.error import URLError
    
  7. requests的使用

    • get请求

      import requests
      url = "http://httpbin.org/get"
      proxy = {									
          "http":"http://211.137.52.158:8080"		#设置代理
      }
      headers = {"User-Agent":UserAgent().random}
      resp = requests.get(url,headers=headers,proxies=proxy)
      print(resp.url)
      resp.encoding = 'utf-8'
      print(resp.text)
      
    • session自动保存cookies

      s = requests.Session() 
      # 用session对象发出get请求,设置cookies 
      s.get('http://httpbin.org/cookies/set/sessioncookie/123456789') 
      
    • ssl验证

      # 禁用安全请求警告
      requests.packages.urllib3.disable_warnings()
      resp = requests.get(url, verify=False, headers=headers)
      

三、数据提取与验证码识别

  1. 正则表达式的使用:https://blog.csdn.net/mingzme/article/details/107250157

    import re
    # `str` stands for the subject string here (it is not defined in this snippet).
    # \w matches one word character; the original "/w" only matched the literal text "/w".
    f1 = re.match(r"\w",str)
    s1 = re.sub(r"every_day","EveryDay",str)	# replace every occurrence
    
  2. 数据提取-Beautiful Soup:https://blog.csdn.net/mingzme/article/details/107250908

    from bs4 import BeautifulSoup
    # `str` stands for the HTML text to parse (not defined in this snippet).
    soup = BeautifulSoup(str, 'lxml')
    print(soup.title)
    # Keep the element itself: attributes are read from the Tag, not from its text.
    a = soup.select('css表达式')[0]		# first element matching the CSS selector
    text = a.text						# text content of the element
    a.get('href')						# value of an attribute of the element
    
  3. 数据提取-XPath:https://blog.csdn.net/mingzme/article/details/107252400

    from lxml import etree
    url='https://www.qidian.com/rank/fengyun?style=1&year=2018&month=08'
    headers = {"User-Agent":UserAgent().chrome}
    resp = requests.get(url,headers=headers)
    e = etree.HTML(resp.text)
    names = e.xpath('//div[@class="book-mid-info"]/h4/a/text()')
    authors = e.xpath('//p[@class="author"]/a[1]/text()')
    for name,author in zip(names,authors):
        print(name +":"+ author)
    
  4. 数据提取-PyQuery:https://blog.csdn.net/mingzme/article/details/107255479

    from pyquery import PyQuery
    # resp is expected to be a response object from an earlier snippet.
    doc = PyQuery(resp.text)
    names = [a.text for a in doc('h4 a')]	# the argument inside () is a CSS selector
    print(names)
    
  5. 数据提取-jsonpath:https://blog.csdn.net/mingzme/article/details/107299928

    • json

      import json
      # Named json_str so the built-in `str` is not shadowed.
      json_str = '{"name":"盗梦空间"}'
      obj = json.loads(json_str)       							# JSON text -> dict
      obj_str = json.dumps(obj,ensure_ascii=False)    		# dict -> JSON text (non-ASCII kept as-is)
      # Save the object to a file; `with` closes the handle once the dump is done.
      with open('movie.txt','w',encoding='utf-8') as f:
          json.dump(obj,f,ensure_ascii=False)
      # Read the file back into an object.
      with open('movie.txt',encoding='utf-8') as f:
          obj2 = json.load(f)
      
    • jsonpath使用

      from  jsonpath import jsonpath
      names = jsonpath(json.loads(resp.text), '$..name')
      ids = jsonpath(resp.json(),"$..id")
      
  6. Tesseract识别文字(需要安装)

    import pytesseract
    from PIL import Image
    img = Image.open('yzm1.jpg')
    # Named text so the built-in `str` is not shadowed.
    text = pytesseract.image_to_string(img)
    print(text)
    
  7. selenium与PhantomJS浏览器自动化插件:https://blog.csdn.net/mingzme/article/details/107303299

    from selenium import webdriver
    from selenium.webdriver.common.by import By
    options = webdriver.ChromeOptions()
    options.add_argument('--headless')          # run the browser headless
    options.add_argument('--proxy-server=http://ip:port')	# route traffic through a proxy
    # `options=` replaces the deprecated `chrome_options=` keyword.
    chrome = webdriver.Chrome(options=options)
    chrome.get("https://cn.bing.com/")
    # find_element(By.ID, ...) replaces the removed find_element_by_id helper.
    chrome.find_element(By.ID, 'sb_form_q').send_keys('python')	# type the query into the search box
    chrome.find_element(By.ID, 'sb_form_go').click()				# click the search button
    chrome.save_screenshot('baidu.png') # take a screenshot
    # The DOM property is document.documentElement (singular).
    js = 'document.documentElement.scrollTop=1000000'
    chrome.execute_script(js)			# scroll down
    html = chrome.page_source   # page source after rendering
    chrome.quit()       # shut the browser down
    

四、Scrapy框架简介与配置

  1. 创建项目

    scrapy startproject myfrist
    
  2. 创建爬虫

    scrapy genspider 爬虫名 爬虫的地址
    
  3. 运行爬虫

    scrapy crawl 爬虫名
    scrapy crawl 爬虫名 -o douban.json -t json  
    #方法二
    from scrapy.cmdline import execute
    execute('scrapy crawl movie'.split())
    
  4. 案例

    #movie.py
        def parse(self, response):
            """Yield one DoubanItem per (name, star) pair found on the page.

            Names and ratings are extracted as two parallel lists and paired
            positionally with zip().
            """
            names = response.xpath('//div[@class="hd"]/a/span[1]/text()').extract()
            stars = response.xpath('//span[@class="rating_num"]/text()').extract()
            for name, star in zip(names, stars):
                # Create a fresh item per movie; reusing one instance would make
                # every yielded reference point at the same, repeatedly mutated object.
                item = DoubanItem()
                item['name'] = name
                item['star'] = star
                yield item
                
    #items.py
    class DoubanItem(scrapy.Item):
        # Declares the two fields an item carries: 'name' and 'star'.
        name = scrapy.Field()
        star = scrapy.Field()
        
    #pipelines.py
    from json import dumps
    class DoubanPipeline:
        """Item pipeline that appends every item to movies.txt as one JSON line."""

        def open_spider(self,spider):
            # Runs when the spider starts: open the output file.
            self.filename = open('movies.txt', mode='w', encoding='utf-8')

        def process_item(self, item, spider):
            # Serialize the item as JSON (non-ASCII kept readable) and write it as one line.
            record = dumps(dict(item), ensure_ascii=False)
            self.filename.write(record + "\n")
            return item

        def close_spider(self,spider):
            # Runs when the spider finishes: close the output file.
            self.filename.close()
    
  5. settings.py的设置内容:https://blog.csdn.net/mingzme/article/details/107322777

  6. 案例

    #qu.py
    class QuSpider(scrapy.Spider):
        """Spider that walks a novel chapter by chapter via the "next" link."""
        name = 'qu'
        allowed_domains = ['qu.la']
        start_urls = ['https://www.qu.la/book/4703/2014176.html']

        def parse(self, response):
            """Yield the chapter's title/content, then follow the next-chapter link."""
            title = response.xpath('//h1/text()').extract_first()
            # string(...) flattens the content div to text; runs of eight spaces become newlines.
            content = response.xpath('string(//div[@class="content"])').extract_first().strip().replace('        ','\n')
            next_url = response.xpath('//div[@class="section-opt"]/a[3]/@href').extract_first()

            yield{
                'title':title,
                'content':content
            }
            # Continue with the next URL, but only when the page actually has one;
            # otherwise a request for the current page would be scheduled again.
            if next_url:
                yield scrapy.Request(response.urljoin(next_url), callback=self.parse)
            
    #pipelines.py
    class FictionPipeline:
        """Item pipeline that appends each chapter (title + content) to fiction.txt."""

        def open_spider(self,spider):
            # Open the output file when the spider starts.
            self.filename = open('fiction.txt', mode='w', encoding='utf-8')

        def process_item(self, item, spider):
            # Title line, content, then blank lines separating chapters.
            chapter = "\n".join((item['title'], item['content'], ""))
            self.filename.write(chapter + '\n\n\n')
            # Flush after every chapter so the file on disk stays up to date.
            self.filename.flush()
            return item

        def close_spider(self,spider):
            # Close the output file when the spider finishes.
            self.filename.close()
            
    

五、scrapy框架高级

  1. crawlspider的使用

    • 创建爬虫

      scrapy genspider qu3 qu.la -t crawl
      
    • 案例

      class Qu3Spider(CrawlSpider):
          """CrawlSpider variant: link-extraction rules drive the crawl instead of manual requests."""
          name = 'qu3'
          allowed_domains = ['qu.la']
          start_urls = ['https://www.qu.la/book/4703/']

          rules = (
              # Link on the index page that the original note marks as the first chapter.
              Rule(
                  LinkExtractor(restrict_xpaths=r'//*[@id="list"]/dl/dd[13]/a'),
                  callback='parse_item',
                  follow=True,
              ),
              # Third link of the "section-opt" block on a chapter page.
              Rule(
                  LinkExtractor(restrict_xpaths=r'//div[@class="section-opt"]/a[3]'),
                  callback='parse_item',
                  follow=True,
              ),
          )

          def parse_item(self, response):
              # Extract the heading and the flattened text of the content div.
              heading = response.xpath('//h1/text()').extract_first()
              raw_text = response.xpath('string(//div[@class="content"])').extract_first()
              yield {
                  'title': heading,
                  'content': raw_text.strip().replace('        ', '\n'),
              }
      
  2. ImagesPipeline 下载图片

    #zol.py
        def parse(self, response):
            """Yield the page's image as an item, then follow the "next page" link."""
            image_url = response.xpath('//img[@id="bigImg"]/@src').extract_first()
            image_name = response.xpath('string(//h3)').extract_first()
            # Skip pages without an image; an item holding [None] cannot be downloaded.
            if image_url:
                yield {
                    'image_urls' : [image_url],		# key name is fixed unless the pipeline method is overridden
                    'image_name' : image_name
                }
            next_url = response.xpath('//a[@id="pageNext"]/@href').extract_first()
            # Only follow the link when the page actually has one.
            if next_url:
                yield scrapy.Request(response.urljoin(next_url),callback=self.parse)
            
    #pipelines.py
    from scrapy.pipelines.images import ImagesPipeline
    from scrapy import Request
    class ImagePipeline(ImagesPipeline):		# subclass of ImagesPipeline
        """Images pipeline that stores each download under the item's image_name."""

        def get_media_requests(self, item, info):
            # item['image_urls'] is a list, while Request expects a single URL
            # string, so issue one request per URL. The name travels in meta so
            # file_path() can read it back.
            for image_url in item['image_urls']:
                yield Request(image_url, meta={'name' : item['image_name']})

        def file_path(self, request, response=None, info=None, *, item=None):		# customise the stored file name
            # `item` is keyword-only with a default, so callers using the older
            # three-argument form keep working.
            name = request.meta['name'].strip().replace('\r\n\t\t','')
            # '/' would be interpreted as a directory separator in the path.
            name = name.replace('/','-')
            return name+'.jpg'
        
     #settings.py
    # NOTE(review): both pipelines are registered with the same priority (300);
    # presumably only the subclass is needed once it is enabled — confirm.
    ITEM_PIPELINES = {
       'scrapy.pipelines.images.ImagesPipeline': 300,
       'image.pipelines.ImagePipeline' : 300,			# add the pipeline that overrides the methods
    }
    # Directory the downloaded images are stored in.
    IMAGES_STORE='C:/Users/Mingz/Desktop/PythonLab/imagee'
    
  3. 中间件:动态UA设置

    #middlewares.py
    #from image.settings import USER_AGENTS
    #from random import choice
    class UserAgentMiddlerware:
        """Downloader middleware that assigns a random User-Agent to outgoing requests."""

        def __init__(self):
            # Build the fake_useragent helper once per middleware instead of once per request.
            self._ua = UserAgent()

        def process_request(self, request, spider):
            # setdefault leaves a User-Agent that is already present on the request untouched.
            request.headers.setdefault(b'User-Agent', self._ua.random)
    #settings.py
    # Register the middleware class defined in middlewares.py (UserAgentMiddlerware).
    DOWNLOADER_MIDDLEWARES = {
       'image.middlewares.UserAgentMiddlerware': 343,	# raised priority
    }
    # Pool of User-Agent strings for a random.choice()-based middleware. It is
    # named USER_AGENTS to match the `from image.settings import USER_AGENTS`
    # import in middlewares.py and to avoid clashing with Scrapy's own
    # USER_AGENT setting, which is a single string.
    USER_AGENTS=[
        'a','b','c'
    ]
    
  4. 中间件:动态代理

    #middlewares.py
    class ProxyMiddlerware:
        """Downloader middleware that routes every request through one proxy."""

        def process_request(self,request, spider):
            # The proxy address (with credentials) is read from request.meta['proxy'].
            proxy_url = 'http://uname:password@ip:port'
            request.meta['proxy'] = proxy_url
    #settings.py
    DOWNLOADER_MIDDLEWARES = {
       'image.middlewares.ProxyMiddlerware':344
    }
    
  5. 登录表单

    class FilterSpider(scrapy.Spider):
        """Spider that posts the same login form several times."""
        name = 'filter'
        # Must match the host requested below (the original read 'baicu.com').
        allowed_domains = ['baidu.com']

        def start_requests(self):
            """Yield three identical form submissions."""
            url = 'https://www.baidu.com'
            form_data= {
                'user':'user',
                'password':'pwd'
            }
            # Cookies are passed through the `cookies` keyword and must be a dict
            # (name -> value); fill in the real cookies here.
            cookies = {}
            for _ in range(3):
                # dont_filter=True disables de-duplication, so the repeated request
                # is sent every time; formdata carries the form fields.
                yield scrapy.FormRequest(url, callback=self.parse, formdata=form_data, dont_filter=True, cookies=cookies)
    

六、爬虫数据存储

  1. mongodb数据库使用

    show dbs
    db.createCollection('student')
    db.dropDatabase()
    show tables
    show collections	#和show tables 一样
    db.student.drop()
    
  2. crud操作

    db.student.save([{name:"刘备"},{name:"董卓"}])		#id重复则覆盖
    db.student.insert({name:"刘备"})						#id重复则报错
    db.student.update({name:"刘备"},{age:33,name:"刘备"})	
    db.student.update({name:"刘备"},{$set:{age:18}},{multi:true})	#更新多条
    db.student.remove({name:"刘备"},{justOne:true})
    db.student.remove({})			#删除所有数据
    db.student.find().limit(3).skip(6).sort({age:1})	#1升序 -1降序
    db.student.find({country:"魏国"}).count()
    db.student.find({$or:[{age:{$lt:25}},{country:'魏国'}]})       #小于25的或者...
    db.student.find({age:{$in:[25,28]}})
    db.student.find({name:/^曹/})			#模糊匹配 姓曹的人
    db.student.find({name:{$regex:"^曹"}})
    db.student.find({$where:function(){return this.age>=23}})	#自定义查询
    db.student.distinct('country')			#去重
    db.student.find({'age':{$exists:true}})
    
  3. Mongo与Python的交互

    from pymongo import MongoClient
    client = MongoClient()
    school = client.school  #获取数据库实例
    student = school.student    #获取集合
    stus = student.find()
    print(stus.next())
    stu = student.find_one({"country":"蜀国"})
    stus = student.find().skip(6).limit(6)
    # stus = student.find().sort("age",pymongo.DESCENDING)
    stu = {"name":"诸葛亮","country":"蜀国"}
    student.insert_one(stu)
    student.update_one({"name":"诸葛亮"},{"$set":{"age":30}})
    student.delete_many({"name":"诸葛亮"})
    
  4. 爬取数据保存到数据库

    #Mongo数据库
    from pymongo import MongoClient
    
    class MongoDemoPipeline:
        """Item pipeline that stores every scraped item in MongoDB."""

        def open_spider(self,spider):
            # Connect with the client defaults and select database "movie",
            # collection "collection".
            self.client = MongoClient()
            self.db = self.client.movie
            self.collection = self.db.collection

        def process_item(self, item, spider):
            # insert_one replaces the legacy insert(). A plain dict copy is
            # inserted so the "_id" key the driver adds does not end up on the item.
            self.collection.insert_one(dict(item))
            return item

        def close_spider(self,spider):
            self.client.close()
            
     #Mysql 数据库
     class MysqlPipeline:
         """Item pipeline that inserts every item into the MySQL table t_maoyan."""

         def open_spider(self,spider):
             # Open the connection and a cursor when the spider starts.
             conn_args = {
                 'host': 'localhost',
                 'port': 3306,
                 'user': 'root',
                 'password': 'root',
                 'db': 'test01',
             }
             self.client = connect(**conn_args)
             self.cursor = self.client.cursor()

         def process_item(self, item, spider):
             # Parameterised insert; the values are passed separately from the SQL text.
             params = [item['name'], item['star']]
             self.cursor.execute('insert into t_maoyan values(0,%s,%s)', params)
             # Commit after every item.
             self.client.commit()
             return item

         def close_spider(self,spider):
             # Release the cursor first, then the connection.
             self.cursor.close()
             self.client.close()
     #settings.py
       ITEM_PIPELINES = {
       'mongo_demo.pipelines.MongoDemoPipeline': 300,
       'mongo_demo.pipelines.MysqlPipeline': 301,
    }
    

七、动态数据抓取

  1. Splash与python:https://blog.csdn.net/mingzme/article/details/107339895

    import requests
    from urllib.parse import quote
    from fake_useragent import UserAgent

    # Variant 1: render.html endpoint, wait two seconds before returning the page.
    url = 'https://www.guazi.com/hengshui/buy/'
    # The target URL is itself a query-string value, so percent-encode it.
    base_url = 'http://192.168.99.100:8050/render.html?url={}&wait=2'.format(quote(url, safe=''))
    resp = requests.get(base_url, headers={'User-Agent': UserAgent().chrome})

    # Variant 2: execute endpoint, driven by a Lua script passed as lua_source.
    url = 'https://www.guazi.com/hengshui/buy/'
    lua_script = '''
    function main(splash, args)
      assert(splash:go('{}'))
      assert(splash:wait(0.5))
      return splash:html()
    end
    '''.format(url)
    base_url = 'http://192.168.99.100:8050/execute?lua_source=' + quote(lua_script)
    resp = requests.get(base_url, headers={'User-Agent': UserAgent().chrome})
    
  2. splash与scrapy

    #settings.py
    SPLASH_URL = 'http://192.168.99.100:8050/'
    DOWNLOADER_MIDDLEWARES = {
    'scrapy_splash.SplashCookiesMiddleware': 723,
    'scrapy_splash.SplashMiddleware': 725,
    'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810,
    }
    SPIDER_MIDDLEWARES = {
      'scrapy_splash.SplashDeduplicateArgsMiddleware': 100
    }
    DUPEFILTER_CLASS = 'scrapy_splash.SplashAwareDupeFilter'
    HTTPCACHE_STORAGE = 'scrapy_splash.SplashAwareFSCacheStorage'
    
    #guazi.py
    from scrapy_splash import SplashRequest
    class Guazi1Spider(scrapy.Spider):
        """Spider whose start request is issued as a SplashRequest."""
        name = 'guazi1'
        allowed_domains = ['guazi.com']

        def start_requests(self):
            # 'wait' is handed to Splash through the request's args.
            splash_args = {'wait':2}
            start_url = 'https://www.guazi.com/bj/buy'
            yield SplashRequest(start_url, callback=self.parse, args=splash_args)

        def parse(self, response):
            # Dump the response text.
            print(response.text)
            
    #guazi2.py
        def start_requests(self):
            # Issue the start request through Splash's 'execute' endpoint with a Lua script.
            url = 'https://www.guazi.com/hengshui/buy/'
            # The script reads the target from args.url, waits 0.5, then returns the page HTML.
            lua_script = '''
            function main(splash, args)
              assert(splash:go(args.url))
              assert(splash:wait(0.5))
              return splash:html()
            end
            '''
            # The script text is passed to Splash as the 'lua_source' argument.
            yield SplashRequest(url, callback=self.parse, endpoint='execute',args={'lua_source':lua_script})
    
  3. selenium 与 scrapy的结合

    #baidu.py
    import scrapy
    from selenium import webdriver
    from scrapy import signals
    class BaiduSpider(scrapy.Spider):
        """Spider that owns a Selenium Chrome driver for the lifetime of the crawl."""
        name = 'baidu'
        allowed_domains = ['baidu.com']
        start_urls = ['http://www.baidu.com/']

        @classmethod
        def from_crawler(cls, crawler, *args, **kwargs):
            """Create the spider, attach a Chrome driver and hook the spider_closed signal."""
            spider = super().from_crawler(crawler, *args, **kwargs)  # build the spider object
            spider.driver = webdriver.Chrome()
            # Get notified when the spider closes so the browser can be shut down.
            crawler.signals.connect(spider.spider_closed, signal=signals.spider_closed)
            return spider

        def spider_closed(self, spider):
            # quit() ends the whole WebDriver session (all windows plus the driver);
            # close() only closes the current window.
            spider.driver.quit()

        def parse(self, response):
            print(response.text)
    #middlewares.py
    from scrapy.http import HtmlResponse
    class SeleniumMiddleware:
        """Downloader middleware that loads each request in the spider's Selenium driver."""

        def process_request(self, request, spider):
            browser = spider.driver
            browser.get(request.url)
            # Returning a response here means later middlewares and the downloader are skipped.
            return HtmlResponse(
                url=request.url,
                body=browser.page_source,
                request=request,
                encoding='utf-8',
            )
    #settings.py
    DOWNLOADER_MIDDLEWARES = {
       'selenium_demo.middlewares.SeleniumMiddleware': 543,
    }
    

你可能感兴趣的:(人工智能,python,爬虫,搜索引擎,java爬虫程序,爬虫搜索,关键字搜索,数据抓取,爬虫,jsoup,数据挖掘,自然语言处理)