#!/usr/bin/env python #-*- coding:UTF-8 -*- ##################################################### # Author: sunfx [email protected] # Last modified: 2014/11/18 # Filename: re.py # Q Q 群: 236147801 ##################################################### import re #1.查找文本中的字符 pattern = 'this' text = 'Does this text match the pattern?' match = re.search(pattern,text) s = match.start() e = match.end() print 'Found "%s"\nin "%s"\nfrom %d to %d ("%s")' %\ (match.re.pattern,match.string,s,e,text[s:e]) ''' match.re.pattern 要匹配的内容 match.string 匹配的字符 s 匹配到内容开始索引 d 匹配到内容结束索引 text[s:e] 匹配字符 ''' #2.编译表达式 regexes = [ re.compile(p) for p in ['this','that'] ] #把字符转换Regexobject格式 print 'Text: %r\n' % text #输出text内容 for regex in regexes: print 'Seeking "%s"->' % regex.pattern, #regex.pattern 要匹配的字符 if regex.search(text): #在text中搜索this or that print 'match!' else: print 'no match' #3.多重匹配 text = 'abbaaabbbbaaaaa' pattern = 'ab' for match in re.findall(pattern,text): print 'Found: "%s"' % match #findall 直接返回字符串 for match in re.finditer(pattern,text): s = match.start() e = match.end() print 'Found "%s" at %d:%d' % (text[s:e],s,e) #finditer 返回原输入文字在字符串的位置 #4.模式语法 def test_patterns(text,patterns=[]): for pattern,desc in patterns: print 'Pattern %r (%s) \n' %(pattern,desc) print ' %r' % text for match in re.finditer(pattern,text): s = match.start() e = match.end() substr = text[s:e] #匹配到的字符 n_backslashes = text[:s].count('\\') #查找文本:s坐标之前的包含多少\\ prefix = '.' * ( s + n_backslashes ) print ' %s%r' % (prefix,substr) print return test_patterns('abbaaabbbbaaaaa', [('ab',"'a' followed by 'b'")] ) #贪婪模式 这种模式会减少单个匹配减少 ''' * '匹配一次到多次' + '至少匹配一次到多次' ? '只匹配一次' ab*, 'a followerd by zero or more b'), #匹配0次或者更多次 ab+, 'a followerd by one or mrore b'), #最少匹配一次或者更多次 ab?, 'a followerd by zero or one b'), #匹配0最多一次 ab{3}, 'a followerd by three b'), #最少匹配三次 ab{2,3}, 'a followerd by two to three b') #匹配两至三次 ab*?, 'a followerd by zero or more b'), #匹配0次或者更多次 ab+?, 'a followerd by one or mrore b'), #最少匹配一次或者更多次 ab??, 'a followerd by zero or one b'), #匹配0最多一次 ab{3}?, 'a followerd by three b'), #最少匹配三次 ab{2,3}?, 'a followerd by two to three b') #匹配两至三次 ''' #用法如下: str = 'absdsdsdsdsd' print re.findall('ab*',str) #['ab'] print re.findall('ab*?',str) #['a'] #5.字符集 ''' [ab] 'either a or b 匹配a或者b' a[ab]+ 'a followerd by 1 more a or b 匹配一次a、b或者多次 ' a[ab]+? 'a followerd by 1 or more a or b,not greedy 匹配1一次可以匹配多次' [^] '不包含内容' [a-z] '所有小写ASCII字母' [A-Z] '所有大写写ASCII字母' [a-zA-Z] '一个小写和大写的序列' [A-Za-z] '一个大写小写的序列' ''' str ='aaaaaaaaaaaaaaaaaabbbbbbbbbbbbbbbbbbbbabbbbbbbbbbbasbsbab,a_baba' print re.findall('[ab]',str) print re.findall('a[ab]+',str) print re.findall('a[ab]+?',str) print re.findall('[^_]',str) str = 'China,lovE' print re.findall('[a-z][A-Z]',str) #['vE'] print re.findall('[A-Z][a-z]',str) #['Ch'] print re.findall('[A-Z][a-z]+',str) #['China'] print re.findall('[a-z][A-Z]+',str) #['vE'] print re.findall('[A-Z][a-z]*',str) #['China', 'E'] print re.findall('[a-z][A-Z]*',str) #['h', 'i', 'n', 'a', 'l', 'o', 'vE'] print re.findall('[A-Z][a-z]?',str) #['Ch', 'E'] print re.findall('[a-z][A-Z]?',str) #['h', 'i', 'n', 'a', 'l', 'o', 'vE'] ''' . 元字符匹配一个字符 a. b. a.*b a.*?b ''' c = 'woaizhongguoawsb,wasssssssssssssdsdsdsdbsdddddddbaaabbbbbbbsd' print re.findall('a.',c) #['ai', 'aw', 'as', 'aa', 'ab'] print re.findall('b.',c) #['b,', 'bs', 'ba', 'bb', 'bb', 'bb', 'bs'] print re.findall('a.*b',c) #['aizhongguoawsb,wasssssssssssssdsdsdsdbsdddddddbaaabbbbbbb'] #贪婪模式匹配a到b之间的任意字符长度字符 print re.findall('a.*?b',c) #['aizhongguoawsb', 'asssssssssssssdsdsdsdb', 'aaab'] # ?结束了* 的贪婪模式, #它不会到最后一个b再去匹配而且见好就收,匹配可能最短的字符 #6.转义码 ''' 转义码 含义 \d 一个数字 \D 一个非字符 \s 空白符(制表符、空格、换行符) \S 非空白符(符号、字母、数字) \w 字母数字 \W 非字母数字(符号、制表符、空格、换行符) ''' #7.锚定 ''' 锚定码 含义 ^ 字符串或行的开始 $ 字符串或行结束 \A 字符串开始 \Z 字符串结束 \b 一个单词开头或者末尾的空串 \B 不在一个单词的开头活末尾的空串 ''' #8.限制搜索 match、search text = 'This is some text --with punctuation.' pattern = 'is' print 'Text :',text print 'pattern:',pattern m = re.match(pattern,text) #因为match是从字符开头开始匹配 is没有在开头所以没有匹配到. print 'Match :',m s = re.search(pattern,text) #is在文本中出现了两次所以匹配到内容 print 'Search :',s pattern = re.compile(r'\b\w*is\w*\b') #编译规则 print 'Text:',text pos = 0 while True: match = pattern.search(text,pos) #搜索规则 if not match: break s = match.start() e = match.end() print ' %d : %d = "%s"' % (s,e-1,text[s:e]) pos = e #9 用户组解析匹配(任何一个正则都可以为组并嵌套在一个更大的表达式中) regex = re.compile(r'(\bt\w+)\W+(\w+)') print 'Input text :',text print 'Pattern :',regex.pattern match = regex.search(text) print 'Entire match :',match.group(0) #表示整个表达式的字符串,子组从1开始排序 print 'World start with "t":',match.group(1) #匹配到的第一组 print 'World after "t" word :',match.group(2) #匹配到的第二组 #python对基本分组进行了扩展 (?P<name>pattern) print text print for pattern in [ r'^(?P<first_word>\w+)', #组名和正则表达式组成 r'(?P<last_word>\w+)\S*$', r'(?P<t_word>\bt\w+)\W+(?P<other_word>\w+)', r'(?P<ends_with_t>\w+t)\b', ]: regex = re.compile(pattern) match = regex.search(text) print 'Matching "%s"' % pattern print ' ',match.groups() #匹配到所有的组的值 print ' ',match.groupdict() #把组名和字串生成字典 print def test_patterns(text,patterns=[]): '''Given source text and a list of patterns,look for matches for each pattern within the text and print them to stdout. ''' #look for each pattern in the text and print the resuls for pattern,desc in patterns: print 'Pattern %r (%s)\n' % (pattern,desc) print ' %r' % text for match in re.finditer(pattern,text): s = match.start() e = match.end() prefix = ' ' * (s) #'空格 X 次数' print ' %s%r%s' % (prefix,text[s:e],' '*(len(text)-e)), print match.groups() if match.groupdict(): print '%s%s' % (' ' * (len(text) -s),match,groupdict()) print return print test_patterns(text,[(r'(a(a*)(b*))','a followerd by 0-n a and 0-n b')]) ''' | 代表左右表达式任意匹配一个,他总是先尝试匹配左边的表达式,一旦成功匹配则 跳过匹配右边的表达式。如果|没有被包括()中,则它的范围是整个正则表达式 ?:pattern ''' #10.搜索选项 - 不区分大小写的匹配 ''' re.IGNORECASE 忽略大小写 ''' text = 'This is some text -- with punctuation.' pattern = r'\bT\w+' with_case = re.compile(pattern) whitout_case = re.compile(pattern,re.IGNORECASE) #re.IGNORECASE 忽略大小写 print 'Text: \n %r' % text print 'Pattern:\n %s' % pattern print 'Case-sensitive:' for match in with_case.findall(text): print ' %r' % match print 'Case-insensitive:' for match in whitout_case.findall(text): print ' %r' % match #11.多行输入 ''' MULTILINE 多行匹配 ''' text = 'This is some text -- with punctuation.\nA secone lines.' pattern = r'(^\w+)|(\w+\S*$)' single_line = re.compile(pattern) multiline = re.compile(pattern,re.MULTILINE) print 'Text:\n %r' % text print 'Pattern:\n %s' % pattern print 'Single Line :' for match in single_line.findall(text): print ' %r' % (match,) print 'MULTILINE :' for match in multiline.findall(text): print ' %r' % (match,) ''' DOTALL 让点字符也可以匹配换行符 ''' pattern = r'.+' no_newlines = re.compile(pattern) dotall = re.compile(pattern,re.DOTALL) print 'Text :\n %r' % text print 'Pattern:\n %s' % pattern print 'No newlines :' for match in no_newlines.findall(text): print ' %r' % match print 'Dotall :' for match in dotall.findall(text): print ' %r' % match #12 Unicode匹配 ''' re.UNICODE 匹配Unicode ''' import codecs import sys #set standard output encoding to UTF-8 sys.output = codecs.getwriter('UTF-8')(sys.stdout) pattern = ur'\w+' ascii_pattern = re.compile(pattern) unicde_pattern = re.compile(pattern,re.UNICODE) print 'Text :',text print 'Pattern :',pattern print 'ASCII :',u', '.join(ascii_pattern.findall(text)) print 'Unicode :',u', '.join(unicde_pattern.findall(text)) ''' re.VERBOSE 让正则更容易读 ''' address = re.compile( ''' [\w\d.+-]+ #username @ ([\w\d.]+\.)+ #domain name prefix (com|org|edu) #TODO:support more top-level domains ''', re.UNICODE | re.VERBOSE) candidates = [ u'[email protected]', u'[email protected]', u'[email protected]', u'[email protected]' ] for candidate in candidates: match = address.search(candidate) print '%-30s %s' % (candidate,'Matche' if match else 'no match') address = re.compile ( ''' #A name is made up of letters,and may include "." #for title abbreviations and middle initials. ((?P<name> ([\w.,]+\S+)*[\w.,]+) \s* # Email addresses are wrapped in angle # brackets: <> but only if a name is # found, so keep the start bracket in this # group. < )? # the entire name is optional # the address itself:[email protected] (?P<email> [\w\d.+-]+ #username @ ([\w\d.]+\.)+ #domain name prefix (com|org|edu) #TODO:support more top-level domains ) >? # optional closeing angle break ''', re.UNICODE | re.VERBOSE) candidates = [ u'[email protected]', u'[email protected]', u'[email protected]', u'[email protected]' u'Fist Last <[email protected]>' u'NO Brackets first.last@example', u'First Last', u'First Middle Last <[email protected]>', u'First M. Last <[email protected]>', u'<[email protected]>', ] for candidate in candidates: print 'candidate:',candidate match = address.search(candidate) if match: print ' Name:',match.groupdict()['name'] print ' Email:',match.groupdict()['email'] else: print ' No match' ''' 正则表达式标志缩写表 标志 缩写 描述 IGNORECASE i 忽略大小写 MULTILINE m 多行匹配 DOTALL s 让点字符也可以匹配换行符 UNICODE u 匹配Unicode VERBOSE x 让正则更容易读 在模式中嵌入标签(?imu)会打开相应的选项 ''' text = 'This is some text -- with punctuation.' pattern = r'(?i)\bT\w+' regex = re.compile(pattern) print 'Text :',text print 'Pattern :',pattern print 'Matches :',regex.findall(text) #13 前向或后向 address = re.compile( ''' # A name is made up of letters, and may include "." # for title abbreviations and middle initials ((?P<name> ([\w.,]+\s+)*[\w.,]+ ) \s+ ) # name is no longer optional # LOOKAHEAD # Email address are wrapped in angle brackets, but only # if they are both present or neither is . (?= (<.*>$) | ([^<].*[^>]$) ) <? # optional opening angle bracket # The address itself: [email protected] (?P<email> [\w\d.+-]+ @ ([\w\d.]+\.)+ (com|org|edu) ) >? ''', re.UNICODE | re.VERBOSE) candidates = [ u'First Last <[email protected]>', u'No Brackets [email protected]', u'Open Brackets <[email protected]>', u'Close Brackets [email protected]', ] for candidate in candidates: print 'Candidate:',candidate match = address.search(candidate) if match: print ' Name :',match.groupdict()['name'] print ' Email :',match.groupdict()['email'] else: print ' No match' #自动忽略系统常用的noreply邮件地址 ''' (?!noreply@.*$) 忽略这个邮件地址 (?<!noreply>) 两种模式 写在username之前不会向后断言 (?<=pattern) 用肯定向后断言查找符合某个模式的文本 ''' address = re.compile( ''' ^ # An address: [email protected] # Ignore noreply address (?!noreply@.*$) [\w\d.+-]+ # username @ ([\w\d.]+\.)+ # domain name prefix (com|org|edu) # limit the allowed top-level domains $ ''', re.UNICODE | re.VERBOSE) candidates = [ u'[email protected]', u'[email protected]', ] for candidate in candidates: print 'Candidate:',candidate match = address.search(candidate) if match: print ' Match:',candidate[match.start():match.end()] else: print ' No match' twitter = re.compile( ''' # A twitter handle: @username (?<=@) ([\w\d_]+) # username ''', re.UNICODE | re.VERBOSE) text = ''' This text includes two Twitter handles. One for @TheSF,and one for the author,@doughellmann. ''' print text for match in twitter.findall(text): print 'handle:',match #14 自引用表达式 #可以把表达式编号后面来引用 address = re.compile( ''' (\w+) # first name \s+ (([\w.]+)\s+)? # optional middle name or initial (\w+) # last name \s+ < # The address: [email protected] (?P<email> \1 #first name \. \4 #last name @ ([\w\d.]+\.)+ (com|org|edu) ) > ''', re.UNICODE | re.VERBOSE | re.IGNORECASE) candidates = [ u'First Last <[email protected]>', u'Different Name <first.last.example.com>', u'First Middle Last <[email protected]>', ] for candidate in candidates: print 'Candidate:',candidate match = address.search(candidate) if match: print ' Match name:',match.group(1),match.group(4) else: print ' No match' #正则表达式解析包括一个扩展,可以使用(?P=name)指示表达式先前匹配的一个命名组的值. address = re.compile( ''' # The regular name (?P<first_name>\w+) \s+ (([\w.]+)\s+)? (?P<last_name>\w+) \s+ < # The address: [email protected] (?P<email> (?P=first_name) \. (?P=last_name) @ ([\w\d.]+\.)+ (com|org|edu) ) > ''', re.UNICODE | re.VERBOSE | re.IGNORECASE) candidates = [ u'First last <[email protected]>', u'Different Name <[email protected]>', u'First Middle last <[email protected]>', u'First M. Last<[email protected]>', ] for candidate in candidates: print 'Candidate:',candidate match = address.search(candidate) if match: print ' Match name:',match.groupdict()['first_name'] print match.groupdict()['last_name'] print ' Match email:',match.groupdict()['email'] else: print 'No match' #15 用模式修改字符串 ''' re支持使用正则表达式作为搜索机制来修改文本,而且可以替换可以引用正则表达式中的匹配组作为替换文本的一部分。 ''' bold = re.compile(r'\*{2}(.*?)\*{2}') text = 'Make this **bold**. This **too**.' print 'Text:',text print 'Bold:',bold.sub(r'<b>\1</b>',text) ''' 使用命名组来替换 count 来限制替换次数 sbun 工作原理和sub相似 subn同时返回修改后的字符串和完成的替换次数 ''' bold = re.compile(r'\*{2}(?P<bold_text>.*?)\*{2}',re.UNICODE,) print 'Text:',text print 'Bold:',bold.sub(r'<b>\g<bold_text></b>',text,count=1) #16 利用模式拆分 ''' str.split() 是分解字符串来完成解析的最常用方法之一,它只是支持字面值得作为分隔符 ''' text = '''Paragraph one one tuo lines. Paragraph two. Paragraph three.''' print 'With findall:' for num,para in enumerate(re.findall(r'.+?\n{2,}|$', text, flags = re.DOTALL) ): print num,repr(para) print print print 'With split:' for num,para in enumerate(re.split(r'\n{2,}',text)): print num,repr(para) print