# -*- coding: utf-8 -*-
"""
Created on Fri Dec 27 11:24:49 2019
@author: jerry
"""
import re
import requests
from bs4 import BeautifulSoup
import csv
def pick_charset(html):
"""
从文本中提取 meta charset 网站编码类型
:param html:
:return:
# """
charset = None
m = re.compile(', re.I).search(html)
if m and m.lastindex == 2:
charset = m.group(2).lower()
return charset
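# Example usage (a minimal sketch; the HTML fragment below is made up for illustration):
#   pick_charset('<meta http-equiv="Content-Type" content="text/html; charset=gb2312">')
#   -> 'gb2312'; for a page that declares no <meta> charset the function returns None.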
if __name__ == '__main__':
    csvFileName = "url列表.csv"  # CSV file containing one URL per row
    with open(csvFileName, newline='', encoding='UTF-8-sig') as csvfile:
        rows = csv.reader(csvfile)
        for row in rows:
            url = row[0]
            res = requests.get(url)
            # Call pick_charset() to read the page's declared charset and decode the response accordingly
            res.encoding = pick_charset(res.text)
            soup = BeautifulSoup(res.text, 'lxml')
            print(soup.title.text)  # Extract and print the <title>
            # Append the result to a CSV file
            with open('查询结果.csv', 'a+', encoding='utf-8-sig') as f:
                f.write(url + ',' + soup.title.text + '\n')
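# A more defensive variant of the per-URL step (a sketch, kept commented out like the
# block below): fall back to requests' statistical guess in res.apparent_encoding when
# pick_charset() finds no <meta> charset, and skip URLs that fail to load.
# for url in urls:  # 'urls' stands for any iterable of URLs, e.g. the rows read above
#     try:
#         res = requests.get(url, timeout=10)
#     except requests.RequestException as err:
#         print('failed to fetch', url, err)
#         continue
#     res.encoding = pick_charset(res.text) or res.apparent_encoding
#     soup = BeautifulSoup(res.text, 'lxml')
#     title = soup.title.text if soup.title else ''
#     with open('查询结果.csv', 'a+', encoding='utf-8-sig') as f:
#         f.write(url + ',' + title + '\n')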
# urlTuple = ("http://www.baidu.com", "http://www.jd.com", "http://www.qq.com")
# for url in urlTuple:
#     res = requests.get(url)
#     res.encoding = pick_charset(res.text)  # Detect the page's declared charset with pick_charset()
#     soup = BeautifulSoup(res.text, 'lxml')
#     print(soup.title.text)  # Extract and print the <title>
#     # Append to a CSV file saved as utf-8-sig, i.e. UTF-8 with a BOM, so that Excel recognises the encoding
#     with open('查询结果.csv', 'a+', encoding='utf-8-sig') as f:
#         f.write(url + ',' + soup.title.text + '\n')
Notes on reading UTF-8 files with a BOM in Python:
1. UTF-8 comes in two flavours: without a BOM and with a BOM. The BOM-less form is the standard; the form with a BOM is mainly a Microsoft convention.
2. Microsoft adds a BOM (byte order mark) to UTF-8 so that UTF-8 can be clearly distinguished from ASCII and other encodings. By default, Windows saves UTF-8 text files with a BOM to mark the file's encoding.
3. When open() with plain 'utf-8' is used on Windows to read a UTF-8 file that carries a BOM, Python treats the BOM at the start of the first line as ordinary text.
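A small demonstration of the behaviour described above (the file name bom_demo.csv is just an example): writing with utf-8-sig prepends the BOM, and reading the file back with plain utf-8 leaves the BOM character '\ufeff' glued to the first field, while utf-8-sig strips it.

with open('bom_demo.csv', 'w', encoding='utf-8-sig') as f:
    f.write('http://www.baidu.com\n')

with open('bom_demo.csv', encoding='utf-8') as f:
    print(repr(f.readline()))   # '\ufeffhttp://www.baidu.com\n' -- the BOM is read as text

with open('bom_demo.csv', encoding='utf-8-sig') as f:
    print(repr(f.readline()))   # 'http://www.baidu.com\n' -- the BOM is stripped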