参考:https://blog.csdn.net/qq_36278071/article/details/79660196
爬一个网站时出现了乱码,于是百度了解决方案,找到了上面那篇文章。
代码原本是用scrapy写的,没有乱码。因为scrapy爬几条数据有些大材小用,我就将其单独写了出来。
乱码的原因是编码不对(页面内容是 UTF-8,却被当作 ISO-8859-1 解码)。先将控制台输出格式改一下。
乱码情况如下
{'d_ws': '8å°', 'd_numbers': '05,23,27,24,02,34', 'd_dx': 0, 'd_number_pro': '08', 'd_number': 115, 'd_period': '005æ\x9c', 'd_ds': 0, 'd_days': '2018-01-20', 'd_tt': '0å¤', 'd_ys': 0}
{'d_ws': '4å°', 'd_numbers': '27,37,02,36,21,30', 'd_dx': 0, 'd_number_pro': '04', 'd_number': 153, 'd_period': '004æ\x9c', 'd_ds': 0, 'd_days': '2018-01-16', 'd_tt': '0å¤', 'd_ys': 0}
{'d_ws': '9å°', 'd_numbers': '47,49,31,29,03,43', 'd_dx': 0, 'd_number_pro': '19', 'd_number': 202, 'd_period': '003æ\x9c', 'd_ds': 0, 'd_days': '2018-01-11', 'd_tt': '1å¤', 'd_ys': 0}
{'d_ws': '3å°', 'd_numbers': '01,36,31,49,12,21', 'd_dx': 0, 'd_number_pro': '23', 'd_number': 150, 'd_period': '002æ\x9c', 'd_ds': 0, 'd_days': '2018-01-06', 'd_tt': '2å¤', 'd_ys': 0}
{'d_ws': '6å°', 'd_numbers': '28,42,18,39,12,35', 'd_dx': 0, 'd_number_pro': '36', 'd_number': 174, 'd_period': '001æ\x9c', 'd_ds': 0, 'd_days': '2018-01-02', 'd_tt': '3å¤', 'd_ys': 0}
将编码转换一下就能正常运行了
# Fetch the page; `.text` is the body already decoded to str by requests.
# NOTE(review): presumably requests decoded it as ISO-8859-1 here, which is
# what produces the garbled output shown above - confirm for other sites.
html = requests.get(url=start_urls, headers=headers).text
# Undo that decoding: turn the str back into the original raw bytes.
html = html.encode("ISO-8859-1")
# Decode the raw bytes with the encoding the page actually uses.
html = html.decode("utf-8")
# Parse the corrected HTML into an lxml element tree for XPath queries.
response = etree.HTML(html)
我将全部代码也添上了,更加便于理解
import requests
from lxml import etree
from functools import reduce
def get_new_data():
    """Scrape the 2018 history page and yield one dict per table row.

    The page is downloaded with ``requests``, decoded as UTF-8 and parsed
    with ``lxml``. Every ``<tr class="infolist">`` row produces a dict with
    these keys:

    * ``d_days``       - text of column 1.
    * ``d_period``     - text of column 2, stripped, last character removed.
    * ``d_number_pro`` - text of the first inner div of column 4.
    * ``d_numbers``    - the "hm" div texts of column 3 joined with commas.
    * ``d_number``     - integer sum of those column-3 numbers.
    * ``d_ds``         - 5 for "单", 15 for "双", otherwise 0 (column 6).
    * ``d_ys``         - 5 for "红", 10 for "绿", 15 for "蓝", otherwise 0
      (column 7).
    * ``d_dx``         - 5 for "小", 15 for "大", otherwise 0 (column 8).
    * ``d_tt``         - text of column 10 with its last character removed.
    * ``d_ws``         - text of column 11 with its last character removed.

    Yields:
        dict: one record per matching table row, in page order.

    Raises:
        IndexError: if a row lacks one of the expected cells.
        UnicodeDecodeError: if the response body is not valid UTF-8.
        ValueError: if a column-3 number is not an integer literal.
    """
    start_urls = 'http://www.kj803.com/History/2018.html'
    headers = {
        "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.84 Safari/537.36"
    }
    # Lookup tables for the label -> score conversions; unknown labels map
    # to 0 via dict.get below.
    ds_scores = {"单": 5, "双": 15}
    ys_scores = {"红": 5, "绿": 10, "蓝": 15}
    dx_scores = {"小": 5, "大": 15}

    # A timeout keeps the call from blocking forever on a stalled server.
    page = requests.get(url=start_urls, headers=headers, timeout=10)
    # Decode the raw bytes as UTF-8 directly. Re-encoding `.text` as
    # ISO-8859-1 only works when requests happened to guess that encoding
    # and raises UnicodeEncodeError on Chinese text otherwise.
    html = page.content.decode("utf-8")
    tree = etree.HTML(html)

    for row in tree.xpath('//tr[@class="infolist"]'):
        numbers = row.xpath("./td[3]/div/div/div[contains(@class,'hm')]/text()")
        ds = row.xpath("./td[6]/text()")[0].strip()
        ys = row.xpath("./td[7]/text()")[0]
        dx = row.xpath("./td[8]/text()")[0]
        tt = row.xpath("./td[10]/text()")[0]
        ws = row.xpath("./td[11]/text()")[0]

        item = {}
        item["d_days"] = row.xpath("./td[1]/text()")[0]
        # [:-1] drops the trailing one-character suffix after the number.
        item["d_period"] = row.xpath("./td[2]/text()")[0].strip()[:-1]
        item["d_number_pro"] = row.xpath("./td[4]/div/div[1]/text()")[0]
        item["d_numbers"] = ",".join(numbers)
        item["d_number"] = sum(map(int, numbers))
        item["d_ds"] = ds_scores.get(ds, 0)
        item["d_ys"] = ys_scores.get(ys, 0)
        item["d_dx"] = dx_scores.get(dx, 0)
        item["d_tt"] = tt[:-1]
        item["d_ws"] = ws[:-1]
        yield item
if __name__ == "__main__":
    # Script entry point: print every scraped record, one per line.
    for record in get_new_data():
        print(record)