Python爬取豆瓣新片排行榜

菜鸟初长成,爬虫路漫漫

import os
import urllib
import urllib.request

from bs4 import BeautifulSoup
# One entry scraped from the Douban movie chart.
class doubanmovies:
    """Plain record holding a movie's name, link and description strings."""

    def __init__(self,name,link,dec):
        self.name, self.link, self.dec = name, link, dec

    def save(self,file):
        """Write the three fields to *file*, one per line, then a blank line."""
        record = '\n'.join((self.name, self.link, self.dec))
        file.write(record + '\n\n')

    def putout(self):
        """Print the three fields on a single line, separated by spaces."""
        print(' '.join((self.name, self.link, self.dec)))
# URL of the Douban movie chart page to scrape.
url='https://movie.douban.com/chart'
# Download the page to a local file; the parsing below reads it back from disk.
urllib.request.urlretrieve(url,'movie.html')
movies=[]
with open('movie.html','rb') as myfile:
    data=myfile.read().decode('utf-8')
if data:
    parsed_html=BeautifulSoup(data,'html.parser')
    # find_all returns a list. The attribute filter depends on the page's
    # markup -- presumably each matching table holds one movie; confirm
    # against the live page if the layout changes.
    tables=parsed_html.find_all('table',{'class':'','width':'100%'})
    for table in tables:
        # Reset per table so a table that lacks a title link or a description
        # can never reuse (or crash on) values left over from an earlier one.
        name2=movieurl=desc=None
        for item in table.find_all('td',{'valign':'top'}):
            # Page-structure pitfall: only some of the cells carry the title
            # link, so every cell is searched and the last match wins.
            for item11 in item.find_all('a',{'class':''}):
                # Remove every whitespace character from the link text.
                name2=''.join(item11.get_text().split())
                print(name2)
                movieurl=item11.get("href")
                print(movieurl)
            for item22 in item.find_all('p'):
                desc=item22.get_text()
                print(desc)
        if name2 is None:
            # No title link in this table: nothing to record.
            continue
        # A missing href/description becomes '' so that save() can still
        # concatenate the fields as strings.
        movie=doubanmovies(name2,movieurl or '',desc or '')
        movies.append(movie)

# Explicit UTF-8 so the Chinese text is written correctly regardless of the
# platform's default encoding.
with open('douban.txt','w',encoding='utf-8') as f:
    for movie in movies:
        movie.save(f)

你可能感兴趣的:(python爬虫)