文本/数据操作

大文件读取

1.
with open(...) as f:
    for line in f:
        print(line)
2.
f = open(...)
while True:
    line = f.readline()
    if not line:
        break
3.
f = open(...)
while True:
    block = f.read(1024)
    if not block:
        break

json

中文编码问题:

import json
json.dumps(data, ensure_ascii=False)

排序:

di = {...}
json.dumps(di, sort_keys=True)

from collections import OrderedDict
json.loads(json_str, object_pairs_hook=OrderedDict)  # json_str 为 JSON 字符串,按原顺序保留键

url中文转码(python3)

import urllib.parse
urllib.parse.unquote("%xxx%xxx...")

csv

import csv
# 写
with open("xx.csv", "w", newline="") as f:
    writer = csv.writer(f, delimiter=",", quotechar="|", dialect='excel')  # 分隔符,引用符,默认excel
    writer.writerow(["a"])
    writer.writerow([1,2,3])
# 读
with open("xx.csv", "r", newline="") as f:
    reader = csv.reader(f, delimiter=",", dialect='excel')  # 分隔符,引用符,默认excel
    column = [row[0] for row in reader]  # 某一列(注意:会读完 reader,之后需 f.seek(0) 才能再次遍历)
    for row in reader:  # 每行操作
        do sth...
  • 文件操作模式
r,w 读,写
r+ 读写(文件须存在,不清空,从开头覆盖写入);w+ 读写(先清空文件,不存在则创建)
a 续写
rb, wb... 二进制

xml

  • lxml
import requests
from lxml import etree
try:
    r = requests.get(url)
    r.raise_for_status()
    r.encoding = "utf-8"
    content = r.content
    selector = etree.HTML(content)
    xx_list = selector.xpath('//h2[@class="ContentItem-title"]/a/text()')
except Exception as e:
    print(e)
  • parsel
from parsel import Selector
resp = Selector(response.text)

format

  • 占位格式
"{:^10}\t{:^10}\t{:^10}".format(a, b, c)
  • 中文字符居中对齐
# chr(12288) 中文空格
"{0:^10}\t{1:{3}^10}\t{2:^10}".format(a, b, c, chr(12288))

BeautifulSoup

from bs4 import BeautifulSoup as bs

你可能感兴趣的:(文本/数据操作)