from bs4 import BeautifulSoup
path = './1_2_homework_required/index.html'

# The script groups rating spans in fives; this assumes the page renders
# exactly five star <span> elements per product.
STARS_PER_ITEM = 5


def group_star_glyphs(span_classes, per_item=STARS_PER_ITEM):
    """Turn a flat run of star-span class lists into per-product rows.

    Args:
        span_classes: One entry per rating ``<span>``, in document order.
            Each entry is that span's CSS class list (or ``None`` when the
            span has no ``class`` attribute).
        per_item: How many consecutive spans belong to one product.

    Returns:
        A list of rows, each a list of '★' / '☆' in document order. If the
        number of spans is not a multiple of ``per_item`` the last row is
        simply shorter (no IndexError).
    """
    glyphs = [
        '☆' if 'glyphicon-star-empty' in (classes or ()) else '★'
        for classes in span_classes
    ]
    # Slice-based chunking keeps left-to-right order and tolerates a
    # trailing partial group.
    return [glyphs[i:i + per_item] for i in range(0, len(glyphs), per_item)]


def main():
    """Parse the saved product page and print one info dict per product."""
    # Read raw bytes so BeautifulSoup detects the document encoding itself
    # instead of depending on the platform's default text encoding.
    with open(path, 'rb') as f:
        soup = BeautifulSoup(f.read(), 'lxml')

    images = soup.select('div > div > div > div > div > div > img')
    titles = soup.select('body > div > div > div.col-md-9 > div > div > div > div.caption > h4 > a')
    prices = soup.select('body > div > div > div.col-md-9 > div > div > div > div.caption > h4.pull-right')
    words = soup.select('body > div > div > div.col-md-9 > div > div > div > div.caption > p')
    stars = soup.select('body > div > div > div.col-md-9 > div > div > div > div.ratings > p > span')

    starslast = group_star_glyphs([span.get('class') for span in stars])
    print(starslast)

    # zip() stops at the shortest list, so a product missing any one field
    # truncates the output rather than raising.
    for image, title, price, word, star in zip(images, titles, prices, words, starslast):
        info = {
            'image': image.get('src'),
            'title': title.get_text(),
            'price': price.get_text(),
            'word': word.get_text(),
            'star': star,
        }
        print(info)


if __name__ == '__main__':
    main()
# 总结:
# (1)旧版 BeautifulSoup(4.7.0 之前)的 select() 不支持 nth-child(只实现了 nth-of-type),所以要去掉 CSS 选择器中的 nth-child 语法
# (2)图片链接放在 <img> 标签的 src 属性里,可以用 get() 方法取出属性值:image_content = image.get('src')