大文件读取
1.
with open(...) as f:
for line in f:
print(line)
2.
f = open(...)
while True:
line = f.readline()
if not line:
break
3.
f = open(...)
while True:
block = f.read(1024)
if not block:
break
json
中文编码问题:
import json
json.dumps(data, ensure_ascii=False)
排序:
di = {...}
json.dumps(di, sort_keys=True)
from collections import OrderedDict
json.loads(json_str, object_pairs_hook=OrderedDict) # json_str 为 JSON 字符串; 按原顺序解析为 OrderedDict
url中文转码(python3)
import urllib.parse
urllib.parse.unquote("%xxx%xxx...")
csv
import csv
# 写
with open("xx.csv", "w", newline="") as f:
writer = csv.writer(f, delimiter=",", quotechar="|", dialect='excel') # 分隔符,引用符,默认excel
writer.writerow("a")
writer.writerow([1,2,3])
# 读
with open("xx.csv", "r") as f:
reader = csv.reader(f, delimiter=",", dialect='excel') # 分隔符,引用符,默认excel
column = [row[0] for row in reader] # 某一列 (注意: 会耗尽 reader, 之后再遍历需重新打开文件)
for row in reader: # 每行操作
do sth...
- 文件操作模式
r,w 读,写
r+, w+ 都为读写 (r+ 不清空, 文件须已存在, 从开头覆盖写; w+ 先清空, 文件不存在则新建)
a 续写
rb, wb... 二进制
xml
- lxml
import requests
from lxml import etree
try:
r = requests.get(url)
r.raise_for_status()
r.encoding = "utf-8"
content = r.content
selector = etree.HTML(content)
xx_list = selector.xpath('//h2[@class="ContentItem-title"]/a/text()')
except Exception as e:
print(e)
- parsel
from parsel import Selector
resp = Selector(response.text)
format
- 占位格式
"{:^10}\t{:^10}\t{:^10}".format(a, b, c)
- 中文字符居中对齐
# chr(12288) 中文空格
"{0:^10}\t{1:{3}^10}\t{2:^10}".format(a, b, c, chr(12288))
BeautifulSoup
from bs4 import BeautifulSoup as bs