Python 1-2

注意事项

  • 正则表达式
  • 获取列表长度
  • 字符集问题
from bs4 import BeautifulSoup
import re

# Path of the locally saved HTML page that this script parses.
html_file = '/Users/XXX/muggle/Plan-for-combating/week1/1_2/1_2answer_of_homework/index.html'

# Open the file with a `with` block so it is closed automatically.
# First argument is the file path; the second is the mode: 'r' reads, 'w' writes.
# An explicit encoding is passed to avoid garbled (mis-decoded) characters.
with open(html_file, 'r', encoding='utf-8') as web_data:
    content = web_data.read()

# Parsing only needs the text already read, so it happens after the file is closed.
soup = BeautifulSoup(content, 'lxml')

# One CSS selector per field; each call returns a list of matching tags.
titles = soup.select("body > div > div > div.col-md-9 > div > div > div > div.caption > h4 > a")
images = soup.select("body > div > div > div.col-md-9 > div > div > div > img")
reviews = soup.select("body > div > div > div.col-md-9 > div > div > div > div.ratings > p.pull-right")
prices = soup.select("body > div > div > div.col-md-9 > div > div > div > div.caption > h4.pull-right")
stars = soup.select('body > div > div > div.col-md-9 > div > div > div > div.ratings > p:nth-of-type(2)')

# zip() pairs the lists positionally and stops at the shortest one.
for title, image, price, star, review in zip(titles, images, prices, stars, reviews):
    # Use a regular expression to pull the number out of the review text.
    # `\d+` requires at least one digit, so the search scans forward to the
    # first real number. The earlier `\d*` could match the empty string at
    # position 0 (e.g. text with leading whitespace), making int('') raise
    # ValueError.
    review_match = re.search(r'\d+', review.get_text())
    data = {
        'title': title.get_text(),
        'image': image.get('src'),
        'price': price.get_text(),
        # len() of the list of star <span> tags gives the star rating.
        'star': len(star.find_all('span', class_="glyphicon glyphicon-star")),
        # None when the review text contains no digits at all.
        'review': int(review_match.group()) if review_match else None,
    }
    print(data)

你可能感兴趣的:(Python 1-2)