Problems encountered in practice

Pitfalls hit while running scrapy crawl liepin

1. Handling JSON files, and the try/except pattern

import json

# Create the JSON file object
self.f = open('city_list.json', 'a', encoding='utf-8')
item = {}
for c in c_list:
    # Build the dict: city name -> link
    city = c.xpath('./@title')[0]
    item[city] = c.xpath('./@href')[0]
# Dump the dict to the JSON file once, after the loop
json.dump(item, self.f, ensure_ascii=False)

# Close the file
self.f.close()
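
Design note: opening the file in 'a' (append) mode means a second run appends a second JSON object, and json.load will then fail on the file. A minimal sketch that avoids this and the manual close (the helper name save_city_list is hypothetical):

import json

def save_city_list(c_list, path='city_list.json'):
    # Build the city-name -> link dict in one pass
    item = {c.xpath('./@title')[0]: c.xpath('./@href')[0] for c in c_list}
    # 'w' overwrites any old file; 'with' closes it even if dump raises
    with open(path, 'w', encoding='utf-8') as f:
        json.dump(item, f, ensure_ascii=False)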

Reading it back:

def select_city(self, city):
    with open('city_list.json', 'r', encoding='utf-8') as f:
        # Load the JSON file into a dict
        f_dict = json.load(f)
        # Try to return the city's link; if the city is missing, warn the user
        try:
            return f_dict[city]
        except KeyError:
            print('The city you entered is outside the query range, please re-enter!')
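
A hypothetical usage sketch (spider stands for an instance of the crawler class); note that select_city falls through and implicitly returns None on a miss, so the caller should check for that:

city_link = spider.select_city('北京')
if city_link is not None:
    page = spider.get_page(city_link, 'python')  # get_page is shown in the next step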
           
2. Getting the total page count
# Requires: import re, import requests
def get_page(self, city_link, key):
    url = self.base_url.format(city_link, '0', key)
    # requests.get usage: .text returns the response body as a str
    res = requests.get(url=url, headers=self.headers).text
    # The <em> node can't be reached with xpath here, so use a regex;
    # re.findall(pattern, text) returns a list, e.g. ['1000+']
    num = int(re.findall(r'<em>(.*?)</em>', res)[0][:-1])  # [:-1] strips the trailing '+'
    page = num // 40  # 40 results per page
    return page
# If requests.get(url, headers=...) is to be followed by xpath parsing,
# set the encoding on the Response object before reading .text:
response = requests.get(url=url, headers=self.headers)
response.encoding = 'utf-8'
parse_html = etree.HTML(response.text)  # requires: from lxml import etree
# r_list example: ['../', 'day01', 'day02', 'redis_day01.zip']
r_list = parse_html.xpath(xpath_bds)
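
Tying the pieces together, a sketch of walking every result page once the count is known; it assumes base_url's second placeholder is the page index, matching the '0' passed in get_page above:

def crawl_all_pages(self, city_link, key):
    # Page total derived from the '1000+' counter (40 results per page)
    total = self.get_page(city_link, key)
    for page in range(total + 1):
        url = self.base_url.format(city_link, str(page), key)
        html = requests.get(url=url, headers=self.headers).text
        # parse html here with re / etree.HTML as shown above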
3. The parsing part

In the Scrapy spider template, you can call response.xpath directly; there is no need to build an etree.HTML object yourself, as shown in the sketch below.
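
For comparison, a minimal sketch of a parse method inside a Scrapy spider; the XPath expressions here are placeholders, not the site's real node structure:

import scrapy

class LiepinSpider(scrapy.Spider):
    name = 'liepin'

    def parse(self, response):
        # No etree.HTML needed: response.xpath works on the body directly
        for job in response.xpath('//ul[@class="sojob-list"]/li/a'):  # placeholder XPath
            yield {
                'title': job.xpath('./@title').get(),
                'link': job.xpath('./@href').get(),
            }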
