Python web scraping in practice: scraping 2345 movies with bs4

The scraping approach is fairly straightforward, so I won't explain it at length; the code comments make it reasonably clear.

Reference: Python网络爬虫实战, 2nd edition (Python Web Crawler in Action)
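Before the full script, here is a minimal sketch of the core fetch-parse-extract flow. The URL and the class names are the ones used in the script below; everything else is simplified for illustration:

from bs4 import BeautifulSoup
import urllib.request

# Fetch the first 2019 list page; the site serves GBK-encoded HTML.
url = 'http://dianying.2345.com/list/----2019---1.html'
request = urllib.request.Request(url, headers={'User-Agent': 'Mozilla/5.0'})
html = urllib.request.urlopen(request).read().decode('GBK')

# Each movie sits in an <li> under <ul class="v_picTxt pic180_240 clearfix">.
soup = BeautifulSoup(html, 'lxml')
movie_list = soup.find('ul', attrs={'class': 'v_picTxt pic180_240 clearfix'})
for li in movie_list.find_all('li'):
    name_tag = li.find('span', attrs={'class': 'sTit'})
    if name_tag is not None:
        print(name_tag.get_text())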

# -*- coding: utf-8 -*-
"""
Created on Thu Apr 16 14:20:20 2020

@author: hja
"""

from bs4 import BeautifulSoup
import urllib.request
import urllib.error
import codecs
from mylog import MyLog as mylog
import re


class MovieItem(object):
    '''Container for one movie's data.'''
    movieName = None
    movieScore = None
    movieStarring = None

class GetMovie(object):
    '''Fetch movie info from dianying.2345.com.'''
    def __init__(self):
        self.urlBase = r'http://dianying.2345.com/list/----2019---1.html'
        self.log = mylog()
        self.pages = self.getPages()
        self.urls = []  # URL pool
        self.items = []
        self.getUrls(self.pages)  # build the list of page URLs to scrape
        self.spider(self.urls)
        self.pipelines(self.items)
        
    def getPages(self):
        '''Get the total number of list pages.'''
        self.log.info('fetching the first list page')
        htmlContent = self.getResponseContent(self.urlBase)
        soup = BeautifulSoup(htmlContent, 'lxml')
        tag = soup.find('div', attrs={'class': 'v_page'})
        subTags = tag.find_all('a', attrs={'target': '_self'})
        self.log.info('page fetched successfully')
        # The pager's second-to-last <a target="_self"> holds the last page number.
        return int(subTags[-2].get_text())
    
    def getResponseContent(self, url):
        '''Fetch a page and return its decoded HTML.'''
        fakeHeaders = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36'}
        request = urllib.request.Request(url, headers=fakeHeaders)

        try:
            response = urllib.request.urlopen(request)
        except urllib.error.URLError:
            self.log.error('failed to fetch url: %s' % url)
            raise
        else:
            self.log.info('fetched url: %s successfully' % url)
            return response.read().decode('GBK')  # the site serves GBK-encoded pages
        
    def getUrls(self, pages):
        '''Build the URL for every list page.'''
        urlHead = 'http://dianying.2345.com/list/----2019---'
        urlEnd = '.html'
        for i in range(1, pages + 1):
            url = urlHead + str(i) + urlEnd
            self.urls.append(url)
            self.log.info('added URL %s to the URL list' % url)
        
    
    def spider(self, urls):
        '''Scrape every list page and collect MovieItem objects.'''
        for url in urls:
            htmlContent = self.getResponseContent(url)
            soup = BeautifulSoup(htmlContent, 'lxml')
            # In the page source, the movie name, score, and stars all live under
            # <ul class="v_picTxt pic180_240 clearfix">.
            anchorTag = soup.find('ul', attrs={'class': 'v_picTxt pic180_240 clearfix'})
            # Each movie is an <li media="..."> whose media id is a run of digits,
            # e.g. <li media="204546">.
            tags = anchorTag.find_all('li', attrs={'media': re.compile(r'\d{5}')})

            for tag in tags:
                item = MovieItem()
                # <span class="sTit"> holds the movie name
                item.movieName = tag.find('span', attrs={'class': 'sTit'}).get_text()
                # <span class="pRightBottom"><em>9.3分</em></span> holds the score
                item.movieScore = tag.find('span', attrs={'class': 'pRightBottom'}).em.get_text().replace('分', '')
                # <span class="sDes">主演:...</span> holds the starring actors
                item.movieStarring = tag.find('span', attrs={'class': 'sDes'}).get_text().replace('主演', '')
                self.items.append(item)
                self.log.info('got movie <<%s>> successfully' % item.movieName)
                
            
    
    def pipelines(self, items):
        '''Write the collected items to a text file.'''
        fileName = '2019热门电影.txt'
        with codecs.open(fileName, 'w', 'utf-8') as fp:
            for item in items:
#                fp.write('%s \t %s \t %s \r\n' % (item.movieName, item.movieScore, item.movieStarring))
                # layout: strip every newline and space so each movie fits on one line
                mess = item.movieName + item.movieStarring + item.movieScore
                new_mess = "".join((re.sub("\n", " ", mess)).split(" "))
                fp.write('%s\n \n' % new_mess)
                self.log.info('movie <<%s>> saved to file "%s"' % (item.movieName, fileName))
                

if __name__ == '__main__':
    GM = GetMovie()
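Running the script fetches every 2019 list page in turn and writes the results to 2019热门电影.txt. Note that mylog is the author's own logging helper (from the reference book) and is not shown here; if you want to run the script without it, a thin stand-in over the standard logging package (my substitution, not part of the original) would look like this:

# Hypothetical drop-in for the missing mylog module: a thin wrapper
# around Python's standard logging package.
import logging

class MyLog(object):
    def __init__(self):
        logging.basicConfig(level=logging.INFO)
        self.logger = logging.getLogger('spider')

    def info(self, msg):
        self.logger.info(msg)

    def error(self, msg):
        self.logger.error(msg)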

I slightly tweaked the layout of what gets written to the TXT file; the result looks like this:
[Screenshot: the formatted output in 2019热门电影.txt]
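The layout tweak is the re.sub/join line in pipelines: it first replaces every newline with a space, then drops all spaces, so each movie ends up on a single line. A quick demonstration (the sample string is made up for illustration):

import re

mess = '电影名 \n 主演:演员A 演员B \n 9.0'
new_mess = "".join((re.sub("\n", " ", mess)).split(" "))
print(new_mess)  # -> 电影名主演:演员A演员B9.0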
