python爬虫乱码(ISO-8859-1)

参考:https://blog.csdn.net/qq_36278071/article/details/79660196

爬一个网站时出现了乱码,然后就百度了解决方案。便找到了上面那篇文章。
代码原本是用scrapy写的,没有乱码。因为scrapy爬几条数据有些大材小用,我就将其单独写了出来。
乱码的原因是编码不对。先将控制台输出格式改一下。
乱码情况如下

{'d_ws': '8å°', 'd_numbers': '05,23,27,24,02,34', 'd_dx': 0, 'd_number_pro': '08', 'd_number': 115, 'd_period': '005æ\x9c', 'd_ds': 0, 'd_days': '2018-01-20', 'd_tt': '0å¤', 'd_ys': 0}
{'d_ws': '4å°', 'd_numbers': '27,37,02,36,21,30', 'd_dx': 0, 'd_number_pro': '04', 'd_number': 153, 'd_period': '004æ\x9c', 'd_ds': 0, 'd_days': '2018-01-16', 'd_tt': '0å¤', 'd_ys': 0}
{'d_ws': '9å°', 'd_numbers': '47,49,31,29,03,43', 'd_dx': 0, 'd_number_pro': '19', 'd_number': 202, 'd_period': '003æ\x9c', 'd_ds': 0, 'd_days': '2018-01-11', 'd_tt': '1å¤', 'd_ys': 0}
{'d_ws': '3å°', 'd_numbers': '01,36,31,49,12,21', 'd_dx': 0, 'd_number_pro': '23', 'd_number': 150, 'd_period': '002æ\x9c', 'd_ds': 0, 'd_days': '2018-01-06', 'd_tt': '2å¤', 'd_ys': 0}
{'d_ws': '6å°', 'd_numbers': '28,42,18,39,12,35', 'd_dx': 0, 'd_number_pro': '36', 'd_number': 174, 'd_period': '001æ\x9c', 'd_ds': 0, 'd_days': '2018-01-02', 'd_tt': '3å¤', 'd_ys': 0}

将编码转换一下就能正常显示了:先按 ISO-8859-1 把字符串还原成原始字节,再按 utf-8 重新解码

    # Fetch the page; `.text` is the response body as a str that requests has
    # already decoded (presumably as ISO-8859-1 here, which is what produces
    # the garbled output shown above -- confirm via the response's `.encoding`).
    html = requests.get(url=start_urls, headers=headers).text
    # Undo that decoding: turn the str back into bytes using ISO-8859-1.
    html = html.encode("ISO-8859-1")
    # Decode those bytes again, this time as UTF-8.
    html = html.decode("utf-8")
    # Parse the corrected markup into an lxml element tree for XPath queries.
    response = etree.HTML(html)

我将全部代码也添上了,更加便于理解

import requests
from lxml import etree
from functools import reduce


def get_new_data():
    """Scrape the 2018 history page and yield one record per table row.

    The page is downloaded with ``requests``, decoded as UTF-8, parsed with
    ``lxml`` and every ``<tr class="infolist">`` row is turned into a dict.

    Yields:
        dict: one item per row with the keys
            ``d_days``       text of column 1,
            ``d_period``     stripped text of column 2 minus its last character,
            ``d_number_pro`` text of the first inner div of column 4,
            ``d_numbers``    the ``hm`` cell texts of column 3 joined by ",",
            ``d_number``     integer sum of those cell texts,
            ``d_ds``         5 / 15 / 0 code derived from column 6,
            ``d_ys``         5 / 10 / 15 / 0 code derived from column 7,
            ``d_dx``         5 / 15 / 0 code derived from column 8,
            ``d_tt``         text of column 10 minus its last character,
            ``d_ws``         text of column 11 minus its last character.

    Raises:
        requests.exceptions.RequestException: on network failure or timeout.
        UnicodeDecodeError: if the response body is not valid UTF-8.
        IndexError: if a row lacks one of the expected cells.
        ValueError: if a number cell does not contain an integer.
    """
    start_urls = 'http://www.kj803.com/History/2018.html'
    headers = {
        "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.84 Safari/537.36"
    }
    # Lookup tables for the categorical columns; any other text maps to 0.
    ds_codes = {"单": 5, "双": 15}
    ys_codes = {"红": 5, "绿": 10, "蓝": 15}
    dx_codes = {"小": 5, "大": 15}

    # A timeout keeps the scraper from hanging forever on a stalled server.
    page = requests.get(url=start_urls, headers=headers, timeout=10)
    # Decode the raw bytes as UTF-8 directly. This gives the same string as
    # `.text.encode("ISO-8859-1").decode("utf-8")` whenever requests guessed
    # ISO-8859-1, but does not break if it guesses some other encoding.
    html = page.content.decode("utf-8")
    response = etree.HTML(html)

    for row in response.xpath('//tr[@class="infolist"]'):
        numbers = row.xpath("./td[3]/div/div/div[contains(@class,'hm')]/text()")

        # Only the ds cell is stripped before the lookup, as in the original.
        # NOTE(review): ys/dx are compared unstripped -- confirm the page never
        # pads those cells with whitespace.
        ds = row.xpath("./td[6]/text()")[0].strip()
        ys = row.xpath("./td[7]/text()")[0]
        dx = row.xpath("./td[8]/text()")[0]
        tt = row.xpath("./td[10]/text()")[0]
        ws = row.xpath("./td[11]/text()")[0]

        item = {}
        item["d_days"] = row.xpath("./td[1]/text()")[0]
        # The final character is dropped (presumably a unit suffix -- confirm).
        item["d_period"] = row.xpath("./td[2]/text()")[0].strip()[:-1]
        item["d_number_pro"] = row.xpath("./td[4]/div/div[1]/text()")[0]
        item["d_numbers"] = ",".join(numbers)
        # sum() also copes with an empty list, where reduce() would raise.
        item["d_number"] = sum(map(int, numbers))
        item["d_ds"] = ds_codes.get(ds, 0)
        item["d_ys"] = ys_codes.get(ys, 0)
        item["d_dx"] = dx_codes.get(dx, 0)
        item["d_tt"] = tt[:-1]
        item["d_ws"] = ws[:-1]
        yield item


if __name__ == "__main__":
    # Script entry point: print each scraped record as the generator yields it.
    for record in get_new_data():
        print(record)

你可能感兴趣的:(python学习)