1. 豆瓣电影——使用 BeautifulSoup 中 soup 对象的 select 方法
import urllib2
import re
from bs4 import BeautifulSoup
# Download the first page of the Douban movie Top 250 list and parse it
# with the stdlib-backed 'html.parser'.
html = urllib2.urlopen('http://movie.douban.com/top250?format=text').read()
soup = BeautifulSoup(html, 'html.parser')

# Print the page heading.
h1 = soup.select('#content > h1')
print(h1[0].text)

# Parallel result lists, one entry per movie: title, detail link, rating.
name = soup.select('.article > ol > li > .item > .info > .hd > a > span:nth-of-type(1)')
link = soup.select('.article > ol > li > .item > .info > .hd > a')
comment = soup.select('.article > ol > li > .item > .info > .bd > .star > .rating_num')

# Every fourth <span> under .star (indices 3, 7, 11, ...) is kept as the
# vote-count element.
# NOTE(review): this assumes each .star block holds exactly four spans --
# confirm against the live markup.
p = soup.select('.article > ol > li > .item > .info > .bd > .star > span')
people = p[3::4]

# Emit one tab-separated record per movie: title, rating, votes, link.
for idx, title_tag in enumerate(name):
    title = unicode(title_tag.text).encode('utf-8')
    votes = unicode(people[idx].text).encode('utf-8')
    href = unicode(link[idx].get('href')).encode('utf-8')
    score = float(comment[idx].text)
    print("%s\t%.1lf\t%s\t%s\n" % (title, score, votes, href))
豆瓣电影TOP250
肖申克的救赎 9.6 833497人评价 https://movie.douban.com/subject/1292052/
这个杀手不太冷 9.4 799617人评价 https://movie.douban.com/subject/1295644/
霸王别姬 9.5 596009人评价 https://movie.douban.com/subject/1291546/
阿甘正传 9.4 684789人评价 https://movie.douban.com/subject/1292720/
美丽人生 9.5 398161人评价 https://movie.douban.com/subject/1292063/
千与千寻 9.2 636550人评价 https://movie.douban.com/subject/1291561/
辛德勒的名单 9.4 368296人评价 https://movie.douban.com/subject/1295124/
泰坦尼克号 9.2 630035人评价 https://movie.douban.com/subject/1292722/
盗梦空间 9.2 737747人评价 https://movie.douban.com/subject/3541415/
机器人总动员 9.3 484276人评价 https://movie.douban.com/subject/2131459/
海上钢琴师 9.2 582046人评价 https://movie.douban.com/subject/1292001/
三傻大闹宝莱坞 9.1 648663人评价 https://movie.douban.com/subject/3793023/
忠犬八公的故事 9.2 435451人评价 https://movie.douban.com/subject/3011091/
放牛班的春天 9.2 436182人评价 https://movie.douban.com/subject/1291549/
大话西游之大圣娶亲 9.2 469645人评价 https://movie.douban.com/subject/1292213/
教父 9.2 332416人评价 https://movie.douban.com/subject/1291841/
龙猫 9.1 404287人评价 https://movie.douban.com/subject/1291560/
楚门的世界 9.0 433695人评价 https://movie.douban.com/subject/1292064/
乱世佳人 9.2 263216人评价 https://movie.douban.com/subject/1300267/
天堂电影院 9.1 301028人评价 https://movie.douban.com/subject/1291828/
当幸福来敲门 8.9 522002人评价 https://movie.douban.com/subject/1849031/
触不可及 9.1 353854人评价 https://movie.douban.com/subject/6786002/
搏击俱乐部 9.0 394147人评价 https://movie.douban.com/subject/1292000/
十二怒汉 9.3 164634人评价 https://movie.douban.com/subject/1293182/
熔炉 9.2 235536人评价 https://movie.douban.com/subject/5912992/
2. 豆瓣书本
import urllib
import urllib2
import re
from bs4 import BeautifulSoup
# Scrape three pages of the Douban book Top 250 list. For every book, print
# its title, the first two "/"-separated fields of its info line, its rating
# and its comment count; afterwards download the cover images.
url = [
    "https://book.douban.com/top250",
    "https://book.douban.com/top250?start=25",
    "https://book.douban.com/top250?start=50",
]  # three pages


def _utf8(tag):
    """Return the text of a BeautifulSoup tag as a UTF-8 byte string."""
    return unicode(tag.text).encode('utf-8')


def _split_info(info):
    """Return the first two '/'-separated fields of *info*.

    The second field is an empty string when *info* contains no '/', so a
    book with a one-field info line no longer raises IndexError.
    """
    fields = info.split('/')
    second = fields[1] if len(fields) > 1 else ""
    return fields[0], second


# Cover-image URLs gathered from all pages. The original code iterated over
# this name without ever defining it, which raised NameError.
imglist1 = []

for j in url:
    h = urllib2.urlopen(j).read()
    s = BeautifulSoup(h, 'html.parser')

    bname = s.select('.item > td:nth-of-type(2) > .pl2 > a')
    people = s.select('.item > td:nth-of-type(2) > .pl')
    rate = s.select('.item > td:nth-of-type(2) > div > .rating_nums')
    comment = s.select('.item > td:nth-of-type(2) > div > span:nth-of-type(3)')

    # The original compiled ' td > a > img' as a regular expression and never
    # used it; it is a CSS selector, so apply it through select() instead.
    # NOTE(review): selector anchored on .item like the others -- confirm
    # against the live markup.
    for img in s.select('.item > td > a > img'):
        src = img.get('src')
        if src:
            imglist1.append(src)

    # First and second fields of each info line, kept as parallel lists.
    p1 = []
    p2 = []
    for info_tag in people:
        first, second = _split_info(_utf8(info_tag))
        p1.append(first)
        p2.append(second)

    # Comment counts with the surrounding parentheses removed.
    c = [_utf8(tag).strip("(").strip(")") for tag in comment]

    # zip() stops at the shortest list, so a page with a missing field
    # truncates the output instead of raising IndexError.
    for title_tag, first, second, rate_tag, count in zip(bname, p1, p2, rate, c):
        print(_utf8(title_tag) + first + " " + second + "\n" + _utf8(rate_tag) + count)

# Number the files across all pages so later pages do not overwrite earlier
# ones. The raw string keeps the backslashes literal without relying on
# unrecognised escape sequences. The target directory must already exist.
for x, src in enumerate(imglist1, 1):
    urllib.urlretrieve(src, r"D:\lj\image\ldc\%s.jpg" % x)