Python 文本处理,看完本博文对python cookbook的笔记,足够了。
请善用目录。
更多、更详细的内容会持续更新。
# -*- coding: cp936 -*-
import sets
mystr = "abcdefghijklmn"
def do_sth_with(c):
    """Process one character; in this demo the processing is just printing it."""
    print(c)
# method 1
# Materialise the string as a list first, then loop over the list's items.
print("method 1")
list1 = list(mystr)
for c in list1:
    do_sth_with(c)
# method 2
# A string is itself iterable, so no intermediate list is needed.
print("method 2")
for c in mystr:
    do_sth_with(c)
# method 3
# List comprehension (also collects each call's return value).
print("method 3")
res = [do_sth_with(c) for c in mystr]
# method 4
# map() applies the function to every character.
print("method 4")
res = map(do_sth_with, mystr)
# method 5
# Set operations: sets are what provide the & | - ^ operators.
magic_chars = sets.Set('abracadabra')
print("magic_chars : " + str(magic_chars))
poppins_chars = sets.Set('supercalifragilisticexpialidocious')
print("poppins_chars : " + str(poppins_chars))
common_chars = magic_chars & poppins_chars
print('magic_chars和poppins_chars的交集:' + ''.join(common_chars))
结果:
# method 1 ~ 4 的输出相同:都是逐行打印 a 到 n
method 5:
acrd
Python 学习笔记 02 – List 推导式-zhoukeli2005-ChinaUnix博客
python内置函数map/reduce/filter - 云计算架构师-解占辉 - 51CTO技术博客
Python Sets.Set 的& | - ^ 等操作
# -*- coding: cp936 -*-
# ASCII: ord() gives the code of a character, chr() maps a code back
print ord('a')
print chr(97)
# Unicode: ord() also accepts a unicode character; unichr() maps a code back
print ord(u'\u2020')
print repr(unichr(8224))
# turn a string into the list of its characters' code values
print map(ord,'Hello Wolrd')
# the reverse direction: build a string from a range of codes
print ''.join(map(chr,range(97,100)))
结果:
97
a
8224
u'\u2020'
[72, 101, 108, 108, 111, 32, 87, 111, 108, 114, 100]
abc
# -*- coding: cp936 -*-
# The naive way: unicode objects fail this test, and so do user-defined
# subclasses of str.
def isExactlyAString(anobj):
    """Return True only when the type of anobj is exactly the built-in str."""
    plain_str_type = type('')
    return type(anobj) is plain_str_type
#接近完美的方法,basestring是str和unicode的共同基类
#用户自定义类型也是应该从basestring派生
#美中不足的是python标准库的UserString无法通过,因为它不是从basestring派生
def isAString(anobj):
return isinstance(anobj,basestring)
# The fallback: slower but effective -- judge by behaviour.
# Python duck typing: if it walks like a duck and quacks like a duck,
# we treat it as a duck.
def isStringLike(anobj):
    """Return True when anobj behaves like a string.

    The probe calls anobj.lower() and concatenates the result with anobj and
    with an empty str; any ordinary exception raised by the probe means
    "not string-like".

    Only Exception subclasses are treated as a failed probe, so
    KeyboardInterrupt and SystemExit are no longer swallowed the way a bare
    ``except:`` would swallow them.
    """
    try:
        anobj.lower() + anobj + ''
    except Exception:
        return False
    return True
# exercise the three checks with a str and a unicode argument each
print isExactlyAString('a')
print isExactlyAString(u'a')
print isAString('a')
print isAString(u'a') # UserString is not exercised: no usage tutorial found yet
print isStringLike('a')
print isStringLike(u'a')
结果:
True
False
True
True
True
True
# -*- coding: cp936 -*-
# each field is padded to a total width of 20 characters
print '|','hej'.ljust(20),'|','hej'.rjust(20),'|','hej'.center(20),'|'
# the fill character can be given as a second argument
print '|','hej'.ljust(20,'#'),'|','hej'.rjust(20,'#'),'|','hej'.center(20,'#'),'|'
结果:
| hej | hej | hej |
| hej################# | #################hej | ########hej######### |
结果中每个字段首尾多出的空格,是 print 语句用逗号分隔多个参数时自动插入的,并不是 ljust/rjust/center 产生的。
# -*- coding: cp936 -*-
# without an argument: strip whitespace from the left end, the right end, or both
x = ' hej '
print '|',x.lstrip(),'|',x.rstrip(),'|',x.strip(),'|'
# with an argument: strip any of the given characters ('a' and 'b') from the ends
y = 'ababab xx bababa'
print '|',y.lstrip('ab'),'|',y.rstrip('ab'),'|',y.strip('ab'),'|'
结果:
| hej | hej | hej |
| xx bababa | ababab xx | xx |
# -*- coding: cp936 -*-
# build one large string from many small pieces with a single join() call
l = ['a','b','c','d','e','f','g']
largeString = ''.join(l)
print largeString
结果:
abcdefg
# -*- coding: cp936 -*-
import re
s = 'Hello World'
# a slice step of -1 copies the string starting from its end (reverse by character)
print s[::-1]
# reverse word by word
revwords = s.split()
revwords.reverse()
print ' '.join(revwords)
# word by word, compact version
revwords = ' '.join(s.split()[::-1])
print revwords
# regex version: the capturing group keeps the whitespace runs as list items
revwords = re.split(r'(\s+)',s)
revwords.reverse()
revwords = ''.join(revwords)
print revwords
# regex version, compact
revwords = ''.join(re.split(r'(\s+)',s)[::-1])
print revwords
# reversed() used in place of [::-1]
revwords = ' '.join(reversed(s.split( )))
print revwords
revwords = ''.join(reversed(re.split(r'(\s+)',s)))
print revwords
# -*- coding: cp936 -*-
import itertools
import string
'''aset包含seq任意字符即可'''
# method 1: the plainest, most practical short-circuit approach.
def containsAny(seq, aset):
    """Return True if at least one item of seq is found in aset."""
    return any(item in aset for item in seq)
#method 2 装逼写法,依旧短路法
def containsAny2(seq,aset):
for item in itertools.ifilter(aset.__contains__,seq):
return True
return False
#method 3 装的艺术 - maketrans和translate
notrans = string.maketrans('','')
def containsAny3(astr,strset):
return len(astr) != len(astr.translate(notrans,strset))
'''seq包含aset全部字符'''
# method 4: again the plainest, most practical short-circuit approach.
def containsAll(seq, aset):
    """Return True if every item of aset is found in seq."""
    return all(item in seq for item in aset)
# method 5: set difference -- not a short-circuit approach, so less efficient.
def containsAll2(seq, aset):
    """Return True if every item of aset is found in seq.

    The items of aset that are missing from seq are collected in a set; an
    empty result means nothing is missing.
    """
    missing = set(aset).difference(seq)
    return not missing
#method 6 装的艺术 - maketrans和translate
notrans = string.maketrans('','')
def containsAll3(astr,strset):
return not strset.translate(notrans,astr)
# sample data; each call below is annotated with the result it should print
str1 = 'abcdefg'
str2 = 'ghijklm'
str3 = 'abcdef'
str4 = 'abcd'
L1 = [1,2,3,3]
L2 = [1,2,3,4]
print containsAny(str1,str2) # str1 contains any of str2: True
print containsAny2(str1,str2) # str1 contains any of str2: True
print containsAll(str3,str4) # str3 contains all of str4: True
print containsAll(str4,str3) # str4 contains all of str3: False
print containsAll2(L1,L2) # L1 contains all of L2: False
print containsAll2(L2,L1) # L2 contains all of L1: True
print containsAny3(str1,str2) # str1 contains any of str2: True
print containsAll3(str3,str4) # str3 contains all of str4: True
print containsAll3(str4,str3) # str4 contains all of str3: False
import string
#这是由Fred L.Drake,Jr 提供的 闭包工厂
#我觉得特别不错!!!忽略delete和keep同时存在的情况,因为那不科学
#如果真要同时存在,我认为分两次处理,才是科学的
def translator(frm='',to='',delete='',keep=None):
if len(to) == 1:
to = to * len(frm)
trans = string.maketrans(frm,to)
if keep is not None:
allchars = string.maketrans('','')
#delete = allchars.translate(allchars,keep)
delete = allchars.translate(allchars,keep.translate(allchars,delete))
def translate(s):
return s.translate(trans,delete)
return translate
# keep only the digits
digits_only = translator(keep=string.digits)
print digits_only('aaaaa11111111aaaaa') #11111111
# remove the digits
no_digits = translator(delete=string.digits)
print no_digits('aaaaa11111111aaaaa') #aaaaaaaaaa
# replace every character of a set with one character
digits_to_hash = translator(frm=string.digits,to='#')
print digits_to_hash('aaaaa11111111aaaaa') #aaaaa########aaaaa
import string
#这个只要translate取反keep即可
allchars = string.maketrans('','')
def makefilter(keep):
delchars = allchars.translate(allchars,keep)
def thefilter(s):
return s.translate(allchars,delchars)
return thefilter
# preparation step: build the filter once
keep = makefilter('abc')
# execution step: apply it to a string
print keep('abcdefgaaa') # abcaaa
# Canonical form of a string: no duplicates, characters in sorted order.
def canonicform(s):
    """Return the distinct characters of s in the order they occur in allchars."""
    keep_chars_of_s = makefilter(s)
    return keep_chars_of_s(allchars)
s = 'aaaaaaaaaasldkfjgh'
print canonicform(s) #adfghjkls
# which is the same as building the filter and applying it to allchars by hand
aa = makefilter(s)
print aa(allchars) #adfghjkls
import string
# unicode.translate is actually simpler: it takes a single table (a mapping)
# keyed by code point; mapping a code point to None deletes that character
tb = dict([ (ord(ch), None) for ch in u" aeiou"])
s = u'four score and seven years ago'
print s.translate(tb) #frscrndsvnyrsg
#本任务中考虑到str要被keep,而不是被del
#要有一个巨大的dict来标记想要的和不想要的!
#Fred L.Drake,Jr提供了一个特不错的方法(大神就是大神):
import sets
class Keeper(object):
def __init__(self,keep): #构造函数
self.keep = sets.Set(map(ord,keep))
def __getitem__(self,n): #重载[]
if n not in self.keep:
return None
return unichr(n)
def __call__(self,s): #被调用触发
return unicode(s).translate(self)
# expose the class under the factory-style name used by the str version
makefilter = Keeper
if __name__ == '__main__':
    just_vowels = makefilter('aeiouy')
    print(just_vowels(u'four score and seven years ago'))
from __future__ import division #Fred说,这兼容未来?
import string
text_characters = "".join(map(chr,range(32,127))) + "\n\r\t\b"
_null_trans = string.maketrans("","")
def istext(s,text_characters=text_characters,threshold = 0.30):
#threshold 阈值
#s包含了空值,不是文本,空值是文本的结束标志
if "\0" in s:
return False
#逻辑判定:空字符串是文本
if not s:
return True
print 'aa'
t = s.translate(_null_trans,text_characters)
print len(t),len(s),len(t) / len(s)
return len(t) / len(s) <= threshold
def istextfile(filename, blocksize=512):
    """Return istext() of the first blocksize bytes of the file filename.

    The file is opened in binary mode and is closed again before the check
    runs, instead of leaving the handle open until garbage collection.
    """
    with open(filename, "rb") as f:
        head = f.read(blocksize)
    return istext(head)
# expected results are noted after each call; 1.bin and 1.txt must exist in
# the current directory
print istext("123123123123") #True
print istextfile('1.bin') #False
print istextfile('1.txt') #True
#检查字符串是不是符合 开头大写,其余小写的情况
import string
notrans = string.maketrans('','')
def containsAny(str,strset):
return len(str) != len(str.translate(notrans,strset))
# Duck typing again: it "quacks" right (equals its own capitalize()) and it
# "looks" right (contains at least one letter).
def isCapitaliezd(s):
    """Return True if s equals s.capitalize() and contains at least one letter."""
    if not (s == s.capitalize()):
        return False
    return containsAny(s, string.letters)
# one capitalized and one non-capitalized sample
s1 = 'This is ouyang...'
s2 = 'this is ouyang...'
print isCapitaliezd(s1) #True
print isCapitaliezd(s2) #False
import struct
# the sample record that gets cut into fixed-width fields
theline = 'abcdefghijklmnopqrstuvwxyz1234567890'
# struct.unpack() splits according to a format:
# a 5-byte field, 3 skipped bytes, then two 8-byte fields
baseformat = "5s 3x 8s 8s"
# whatever the base format does not cover becomes one last field
numremain = len(theline) - struct.calcsize(baseformat)
format = "%s %ds" % (baseformat,numremain)
l,s1,s2,t = struct.unpack(format,theline)
print l,s1,s2,t
# struct.unpack() is too good not to wrap in a helper.
def fields(baseformat, theline, lastfield=False):
    """Split theline into the fixed-width fields described by baseformat.

    The part of theline that baseformat does not cover is returned as one
    extra trailing field when lastfield is true (struct code 's') and is
    skipped when lastfield is false (struct code 'x').
    Returns the tuple produced by struct.unpack.
    """
    leftover = len(theline) - struct.calcsize(baseformat)
    tail_code = 's' if lastfield else 'x'
    full_format = "%s %d%s" % (baseformat, leftover, tail_code)
    return struct.unpack(full_format, theline)
# with and without the trailing remainder field
print 'lastField == True : ',fields(baseformat,theline,True)
print 'lastField == False : ',fields(baseformat,theline,False)
# struct.unpack() wrapper, memoizing version: a dict serves as the cache.
def fields_memoizing(baseformat, theline, lastfield=False, _cache={}):
    """Split theline into the fixed-width fields described by baseformat.

    Same result as computing the full struct format on every call, but the
    format string is built only once per distinct
    (baseformat, len(theline), lastfield) combination.

    lastfield -- true: return the uncovered remainder as a trailing field;
                 false: skip it.
    _cache    -- deliberately a mutable default argument: the dict lives
                 across calls and holds the formats computed so far.
    Returns the tuple produced by struct.unpack.
    """
    key = baseformat, len(theline), lastfield
    full_format = _cache.get(key)
    if full_format is None:
        numremain = len(theline) - struct.calcsize(baseformat)
        tail_code = 's' if lastfield else 'x'
        full_format = "%s %d%s" % (baseformat, numremain, tail_code)
        _cache[key] = full_format
    return struct.unpack(full_format, theline)
# the memoizing version prints the same results as fields()
print 'lastField == True : ',fields_memoizing(baseformat,theline,True)
print 'lastField == False : ',fields_memoizing(baseformat,theline,False)
# groups of 5 characters; slicing never overruns, so a shorter final group is fine
fivers = [theline[k:k+5] for k in xrange(0,len(theline),5)]
print fivers
# Groups of n characters, wrapped in a helper; a short final group never
# overruns the string.
def split_by(theline, n, lastfield):
    """Split theline into consecutive pieces of n characters.

    theline   -- the string to split.
    n         -- the piece length; it is also the step between piece starts
                 (the step used to be hard-coded to 5, which skipped or
                 repeated characters for any other n).
    lastfield -- when false, a final piece shorter than n is dropped.

    Returns the list of pieces; an empty theline gives an empty list.
    """
    pieces = [theline[k:k + n] for k in range(0, len(theline), n)]
    # drop the last piece if it is too short and not wanted
    if pieces and not lastfield and len(pieces[-1]) < n:
        pieces.pop()
    return pieces
# with and without the short final piece
print 'split_by lastField == True : ',split_by(theline,5,True)
print 'split_by lastField == False : ',split_by(theline,5,False)
# slices at given cut positions; zip() pairs each start index with its end
# index (None as the last end means "up to the end of the string")
cuts = [8,14,20,26,30]
pieces = [theline[i:j] for i,j in zip([0]+cuts,cuts+[None])]
print pieces
# Slices at given cut positions (start/end pairs built with zip), as a helper.
def split_at(theline, cuts, lastfield):
    """Split theline at the index positions listed in cuts.

    cuts      -- list of cut indexes.
    lastfield -- when false, the piece after the last cut is left out.
    Returns the list of pieces.
    """
    starts = [0] + cuts
    ends = cuts + [None]
    pieces = [theline[begin:end] for begin, end in zip(starts, ends)]
    if lastfield:
        return pieces
    # the final piece is not wanted
    return pieces[:-1]
# with and without the piece after the last cut
print 'split_at lastField == True : ',split_at(theline,cuts,True)
print 'split_at lastField == False : ',split_at(theline,cuts,False)
# Slices at given cut positions, written with yield.
# This is a generator rather than an ordinary function: calling it returns
# an iterator that produces the pieces one at a time.
def split_at_yeild(theline, cuts, lastfield=True):
    """Yield the pieces of theline delimited by the index positions in cuts.

    When lastfield is true, the piece after the last cut is yielded as well.
    """
    start = 0
    for end in cuts:
        yield theline[start:end]
        start = end
    if lastfield:
        yield theline[start:]
# consume the generator, printing one piece per line
for i in split_at_yeild(theline, cuts, True):
    print(i)
字典和元组——菜鸟的Python笔记
Python yield 使用浅析
# Alignment helpers.
def reinent(s, numSpaces):
    """Return s with every line stripped and re-indented by numSpaces spaces."""
    prefix = ' ' * numSpaces
    return '\n'.join(prefix + line.strip() for line in s.splitlines())
def addSpaces(s, numAdd):
    """Return s with numAdd spaces put in front of every line.

    Line endings are preserved because the lines are split with
    splitlines(True).
    """
    pad = " " * numAdd
    lines = s.splitlines(True)
    return pad + pad.join(lines)
def numSpaces(s):
    """Return a list with the amount of leading whitespace of each line of s."""
    counts = []
    for line in s.splitlines():
        counts.append(len(line) - len(line.lstrip()))
    return counts
def delSpaces(s, numDel):
    """Return s with the first numDel characters removed from every line.

    Raises ValueError when numDel is larger than the smallest leading
    whitespace found on any line of s.
    """
    smallest_indent = min(numSpaces(s))
    if numDel > smallest_indent:
        raise ValueError("删的空格比最小空格数大")
    trimmed = [line[numDel:] for line in s.splitlines()]
    return '\n'.join(trimmed)
# sample multi-line text used to try out the indentation helpers
x = """ line one
line two
and line three """
print x
print reinent(x,4)
print addSpaces(x,4)
print delSpaces(x,3)
# convert tabs to spaces; the lengths are printed to show the difference
s = "a\t aaaaa\t aaaa"
s1 = s.expandtabs()
print s,len(s)
print s1,len(s1)
# Convert spaces back to tabs.
def unexpand(s, tablen=8):
    """Return s with runs of spaces replaced by tabs where a tab stop allows.

    s      -- the string to convert; tabs already present are expanded first.
    tablen -- distance between tab stops (now also passed to expandtabs).

    A run of spaces that reaches or crosses a tab stop becomes the needed
    tabs followed by the spaces left after the last stop; a run that crosses
    no tab stop is kept as it is. Expanding the result with the same tablen
    gives back s.expandtabs(tablen).
    """
    import re
    # the capturing group keeps the runs of spaces as the odd-indexed pieces
    pieces = re.split(r'( +)', s.expandtabs(tablen))
    # column reached so far on the current line
    column = 0
    for i, piece in enumerate(pieces):
        if i % 2 == 0:
            # non-space piece: a line break restarts the column count
            last_break = max(piece.rfind('\n'), piece.rfind('\r'))
            if last_break >= 0:
                column = len(piece) - last_break - 1
            else:
                column += len(piece)
            continue
        start = column
        column += len(piece)
        # number of tab stops lying inside (start, column]
        numtabs = column // tablen - start // tablen
        if numtabs:
            # spaces still needed after the last tab stop
            numblanks = column % tablen
            pieces[i] = '\t' * numtabs + ' ' * numblanks
    return ''.join(pieces)
# convert the expanded string back and compare the lengths
s2 = unexpand(s1)
print s1,len(s1)
print s2,len(s2)
def expand(format, d, marker='"', safe=False):
    """Replace every marker-delimited name in format with its value from d.

    format -- the template text.
    d      -- mapping from names to replacement strings.
    marker -- the delimiter put around names in format.
    safe   -- when true, a name missing from d is left in place, still
              wrapped in markers; when false it raises KeyError.
    """
    def lookup(w):
        if not safe:
            return d[w]
        # dict.get(key[, default]): fall back to the name wrapped in markers
        return d.get(w, w.join(marker * 2))

    parts = format.split(marker)
    # every second piece (odd indexes) sat between two markers: substitute it
    parts[1::2] = [lookup(name) for name in parts[1::2]]
    return ''.join(parts)
# the quoted names a and b are replaced by their dictionary values
format = 'just a "a" "b" test '
print expand(format,{'a':'one','b':'two'})
# output:
#>>>
#just a one two test
#>>>
import string
# one template, filled from a mapping or from keyword arguments
new_style = string.Template('this is $thing')
print(new_style.substitute({'thing': 5}))
print(new_style.substitute({'thing': 'test'}))
print(new_style.substitute(thing=5))
print(new_style.substitute(thing='test'))
# locals() hands the current variables (number, square) to the template
msg = string.Template('the square of $number is $square')
for number in range(10):
    square = number * number
    print(msg.substitute(locals()))
import re
# mapping of substrings to their replacements, and a sample text
adict = {'a':'1','b':'2'}
s = 'abacabc'
# the regular expression source: the escaped keys joined as alternatives
print '|'.join(map(re.escape,adict))
def multiple_replaces(text,adict):
robj = re.compile('|'.join(map(re.escape,adict)))
def one_xlat(match):
print '###',match.group(0)
return adict[match.group(0)]
return robj.sub(one_xlat,text)
# run the replacement on the sample text
print multiple_replaces(s,adict)
实验结果:
>>>
a|b
### a
### b
### a
### a
### b
121c12c
>>>
明显发现:
import re
def make_xlat(*args, **kwds):
    """Build a translation function from a dict of replacements.

    The arguments are passed on to dict(). The regular expression is
    compiled once here; the returned function replaces, in the text it is
    given, every key of the dict by the corresponding value.
    """
    adict = dict(*args, **kwds)
    alternatives = [re.escape(key) for key in adict]
    rx = re.compile('|'.join(alternatives))

    def xlat(text):
        return rx.sub(lambda match: adict[match.group(0)], text)
    return xlat
# build the translator once, then apply it
adict = {"a":"1","b":"2"}
translate = make_xlat(adict)
print translate('abacacb')
rx = re.compile(r'\b%s\b' % r'\b|\b'.join(map(re.escape,adict))) # whole-word version of the pattern
import re
class make_xlat:
    """Callable that replaces every key of a dict found in a text by its value.

    The constructor arguments are passed on to dict(). Subclasses can
    override make_rx() to change how the keys are matched.
    """

    def __init__(self, *args, **kwds):
        self.adict = dict(*args, **kwds)
        self.rx = self.make_rx()

    def make_rx(self):
        # the escaped keys joined as regular-expression alternatives
        escaped_keys = [re.escape(key) for key in self.adict]
        return re.compile('|'.join(escaped_keys))

    def one_xlat(self, match):
        # replacement callback: look the matched text up in the dict
        matched_text = match.group(0)
        return self.adict[matched_text]

    def __call__(self, text):
        return self.rx.sub(self.one_xlat, text)
# Override: only make_rx changes, so that keys match whole words only.
class make_xlat_by_whole_words(make_xlat):
    """Variant of make_xlat whose keys are matched only between word boundaries."""

    def make_rx(self):
        escaped_keys = map(re.escape, self.adict)
        alternatives = r'\b|\b'.join(escaped_keys)
        return re.compile(r'\b%s\b' % alternatives)
# the same mapping drives both translators
adict = {"a":"1","b":"2","xyz":"456"}
translate = make_xlat(adict)
translate_by_whole_words = make_xlat_by_whole_words(adict)
# first line: every occurrence replaced; second line: whole words only
print translate('aba xyz cacb')
print translate_by_whole_words('aba xyz cacb')
''' 结果:
>>>
121 456 c1c2
aba 456 cacb
>>>
'''
def imap(function, *iterables):
    """Pure-Python illustration of itertools.imap.

    Yields function(*args), where args holds the next item of every
    iterable; when function is None the items are yielded as a tuple.
    The generator ends as soon as any of the iterables is exhausted.

    The iterators are stored in a list (not a one-shot map object) and the
    exhaustion of an input is caught explicitly, so the generator ends
    cleanly instead of relying on StopIteration escaping from its body.
    """
    # imap(pow, (2,3,10), (5,2,3)) --> 32 9 1000
    iterators = [iter(it) for it in iterables]
    while True:
        try:
            args = [next(it) for it in iterators]
        except StopIteration:
            # one of the inputs ran out: stop producing values
            return
        if function is None:
            yield tuple(args)
        else:
            yield function(*args)
1、先把各个参数都变成迭代器对象(iter 是一个内置函数,即 function)
import itertools,os
def anyTrue(predicate,sequence):
return True in itertools.imap(predicate,sequence)
def endswith(s, *endings):
    """Return True if s ends with at least one of the given endings."""
    # the bound method s.endswith serves directly as the predicate
    ends_with = s.endswith
    return anyTrue(ends_with, endings)
# print the image files found in the current directory
# ('.jpeg' was misspelled '.jepg', so files with that extension were never listed)
for f in os.listdir('.'):
    if endswith(f, '.jpg', '.png', '.jpeg'):
        print(f)
'''
>>>
psu.jpg
QQ图片20130927105447.jpg
>>>
'''
L = ['a','b','c']
# x is append looked up on the instance L, i.e. a method bound to L
x = L.append
x('d')
print L
这里的 x('d') 等价于 L.append('d'),x 就是 L 的一个绑定方法(bound method)。
# unbound method: taken from the type itself (the built-in is spelled `list`),
# so the instance has to be passed explicitly as the first argument
x = list.append
x(L,'d')  # has the same effect as L.append('d')
# decode the two UTF-8 bytes \xc3\xa4 into a unicode object
german_ae = unicode('\xc3\xa4','utf8')
print german_ae
'''
>>>
ä
>>>
'''
>>> import this
The Zen of Python, by Tim Peters
Beautiful is better than ugly.
Explicit is better than implicit.
Simple is better than complex.
Complex is better than complicated.
Flat is better than nested.
Sparse is better than dense.
Readability counts.
Special cases aren't special enough to break the rules.
Although practicality beats purity.
Errors should never pass silently.
Unless explicitly silenced.
In the face of ambiguity, refuse the temptation to guess.
There should be one-- and preferably only one --obvious way to do it.
Although that way may not be obvious at first unless you're Dutch.
Now is better than never.
Although never is often better than *right* now.
If the implementation is hard to explain, it's a bad idea.
If the implementation is easy to explain, it may be a good idea.
Namespaces are one honking great idea -- let's do more of those!
>>>
这里放下预言:将来某一天一定会被这个Unicode和str搞死的。
拓展阅读:
str和unicode解析(这是一段有趣的对话,明白encode和decode的真正意义)
pass
import sys

# an if/else whose branches hold only comments is a syntax error, so each
# branch carries a placeholder statement
if sys.stdout.isatty():
    # You're running in a real terminal
    pass
else:
    # You're being piped or redirected
    pass
formatter的相关资料