解析网页(2)

from bs4 import BeautifulSoup
# Parse a saved HTML page with BeautifulSoup, collect one record per article
# (title, categories, description, rating, image URL) and print the articles
# whose rating is greater than 3.
#
# NOTE(review): the file is opened in text mode with the platform default
# encoding; confirm that this matches the encoding of new_index.html.
with open('C:/Users/Administrator/Desktop/python/Plan-for-combating-master/week1/1_2/1_2code_of_video/web/new_index.html','r') as web:
    soup = BeautifulSoup(web, 'lxml')
    # Every selector is a quoted CSS path and has to mirror the tag nesting
    # of the HTML file exactly.
    # Article title links.
    titles = soup.select('body > div.main-content > ul > li > div.article-info > h3 > a')
    # Category containers. An article can carry several categories, so the
    # selector stops at the enclosing <p> (no trailing span) to get exactly
    # one element per article.
    cates = soup.select('body > div.main-content > ul > li > div.article-info > p.meta-info')
    # Article descriptions.
    descs = soup.select('body > div.main-content > ul > li > div.article-info > p.description')
    # Rating values.
    rates = soup.select('body > div.main-content > ul > li > div.rate > span')
    # Article images.
    images = soup.select('body > div.main-content > ul > li > img')

info = []
# zip() walks the five result lists in lockstep and stops at the shortest one.
for title, cate, desc, rate, img in zip(titles, cates, descs, rates, images):
    data = {
        'title': title.get_text(),  # text inside the title link
        # stripped_strings yields each nested string with whitespace trimmed;
        # the list keeps all categories of the article under a single key.
        'cate': list(cate.stripped_strings),
        'desc': desc.get_text(),
        'rate': rate.get_text(),
        'image': img.get('src'),  # value of the tag's src attribute
    }
    # Append inside the loop so that every article's record is kept.
    info.append(data)

# Print only the records whose rating is greater than 3.
for i in info:
    if float(i['rate']) > 3:
        print(i['title'],i['rate'],i['cate'],i['image'],i['desc'])

你可能感兴趣的:(解析网页(2))