1.正则表达式是一个特殊的字符序列,用来检测一个字符串是否与我们所设定的字符序列相匹配
2.快速检索文本、实现一些替换文本操作
举个栗子:
# Check whether 'Python' occurs in string a.
a = 'C|C++|JAVA|C#|Python|JavaScript'
# Built-in string checks.
# str.find returns -1 when the substring is absent (str.index would raise
# ValueError instead), so compare against -1 rather than an arbitrary offset.
print(a.find('Python') != -1)  # True
print('Python' in a)  # True
# Regular-expression check: findall returns every non-overlapping match.
import re
r = re.findall('Python', a)
if r:
    print("字符串中包含Python")
print(r)
# 2. Extract the digits from string b.
b = 'C0C++1JAVA2C#3Python4JavaScript9'
# \d matches a single digit. The raw string keeps the backslash intact for
# the regex engine instead of relying on an invalid string escape.
r = re.findall(r'\d', b)
print(r)
# 上述中 'Python'是普通字符 '\d'是元字符
---------------------------------输出---------------------------
True
True
字符串中包含Python
['Python']
['0', '1', '2', '3', '4', '9']
# Character sets
import re
s = 'abc, acc, adc. aec, afc, ahc'
# Each pattern picks three-letter words by their middle letter:
#   'a[cf]c'   middle letter is c or f            -> ['acc', 'afc']
#   'a[^cf]c'  middle letter is NOT c or f (^)    -> ['abc', 'adc', 'aec', 'ahc']
#   'a[c-f]c'  middle letter in the range c to f  -> ['acc', 'adc', 'aec', 'afc']
for pattern in ('a[cf]c', 'a[^cf]c', 'a[c-f]c'):
    r = re.findall(pattern, s)
    print(r)
\d | 匹配所有数字 | \D | 匹配所有非数字字符 |
---|---|---|---|
\w | 匹配单词字符(字母、数字和下划线),在ASCII范围内等同于[A-Za-z0-9_] |
\W | 匹配非单词字符 |
\s | 匹配空白字符 | \S | 匹配非空白字符 |
. | 匹配除换行符\n之外的所有字符 |
b = 'C0 C++&1JAVA2C#3Python4JavaScript\n9'
# Raw strings keep \d, \w and \W intact for the regex engine instead of
# relying on invalid string escapes.
r1 = re.findall(r'\d', b)  # ['0', '1', '2', '3', '4', '9']
r2 = re.findall('[0-9]', b)  # ['0', '1', '2', '3', '4', '9']
r3 = re.findall(r'\w', b)  # every letter and digit of b, one per element
r4 = re.findall('[A-Za-z0-9_]', b)  # same result as \w for this ASCII-only input
r5 = re.findall(r'\W', b)  # [' ', '+', '+', '&', '#', '\n']
print(r1, r2)
print(r3)
print(r4)
print(r5)
# Quantifiers
import re
a = 'python 1111java686php'
# Matching words: exactly three letters at a time chops longer words up.
r1 = re.compile('[a-z]{3}').findall(a)  # ['pyt', 'hon', 'jav', 'php']
# Three to six letters captures each word whole.
r = re.compile('[a-z]{3,6}').findall(a)  # ['python', 'java', 'php']
print(r)
b = 'pytho0python1pythonn2'
# * : the preceding character zero or more times
rb = re.compile('python*').findall(b)  # ['pytho', 'python', 'pythonn']
# + : the preceding character one or more times
rb = re.compile('python+').findall(b)  # ['python', 'pythonn']
# ? : the preceding character zero or one time, so the third result
# drops the doubled trailing 'n'
rb = re.compile('python?').findall(b)  # ['pytho', 'python', 'python']
print(rb)
# Greedy versus non-greedy (a trailing ? after the quantifier)
r = re.compile('[a-z]{3,6}').findall(a)  # greedy
r = re.compile('[a-z]{3,6}?').findall(a)  # non-greedy: ['pyt', 'hon', 'jav', 'php']
# Non-greedy mode settles for the lower bound of the range.
rb = re.compile('python{1,2}').findall(b)  # ['python', 'pythonn']
r3 = rb
rb = re.compile('python{1,2}?').findall(b)  # ['python', 'python']
r3 = rb
print(r3)
# Boundary (anchor) matching
import re
qq = '761330110'
# Check whether qq is 4 to 8 digits long.
# ^ anchors the match at the start of the string and $ at the end; qq has
# 9 digits, so this yields []. The raw string keeps \d intact for the regex
# engine instead of relying on an invalid string escape.
r = re.findall(r'^\d{4,8}$', qq)
r = re.findall('110$', qq)  # ['110']
print(r)
# Groups are written with ()
import re
a = 'PythonPythonPythonPythonPythonPython'
# Inside (abc) the characters must all appear, in order ("and").
# Inside [abc] any one of the characters may appear ("or").
group_pattern = re.compile('(Python)')
r = group_pattern.findall(a)
print(r)
# Matching-mode flags
import re
l = 'PythonC#\nJavPHP'
# re.IGNORECASE (alias re.I) ignores case; re.DOTALL (alias re.S) lets '.'
# match every character, newline included.
r = re.findall('c#.{1}', l, flags=re.IGNORECASE | re.DOTALL)
print(r)  # ['C#\n']
re.sub 正则替换:
# re.sub正则替换
import re
l = 'PythonC#JavPHPC#'
# Built-in alternative: l = l.replace('C#', 'GO'). Strings are immutable, so
# the result has to be assigned back to a name.
# Regex replacement. count is passed by keyword because passing it
# positionally is deprecated since Python 3.13.
r = re.sub('C#', 'GO', l)  # PythonGOJavPHPGO
r = re.sub('C#', 'GO', l, count=0)  # PythonGOJavPHPGO; 0 is the default and replaces every C#
r = re.sub('C#', 'GO', l, count=1)  # PythonGOJavPHPC#; only the first C# is replaced
def convert(value):
    """Wrap the matched text in '!!' markers; usable as a re.sub callback.

    Args:
        value: A match object exposing ``group()``.

    Returns:
        The matched substring with '!!' added before and after it.
    """
    matched = value.group()
    return '!!' + matched + '!!'
# The replacement argument of re.sub may be a function; it is called with
# each match object and its return value is used as the replacement text.
r = re.sub('C#', convert, l)
print(r) # Python!!C#!!JavPHP!!C#!!
# 把函数作为参数传递
import re
# 替换规则:找出字符串s中所有数字,若是大于等于6的替换成9,小于6的替换成0
s = 'ABC372154D58'
def convert(value):
    """Map a matched number to '9' or '0'; usable as a re.sub callback.

    Args:
        value: A match object whose ``group()`` is text that ``int`` can parse.

    Returns:
        '9' when the matched number is greater than or equal to 6,
        otherwise '0'.

    Raises:
        ValueError: If the matched text is not a valid integer literal.
    """
    matched = value.group()
    return '9' if int(matched) >= 6 else '0'
# The raw string keeps \d intact for the regex engine instead of relying on
# an invalid string escape; convert is called once per matched digit.
r = re.sub(r'\d', convert, s)
print(r)
re.match()、re.search()、re.findall():
# re.match re.search
import re
s = 'ABC372154D58'
s1 = '886372154D58'
# Raw strings keep \d intact for the regex engine instead of relying on an
# invalid string escape.
r = re.match(r'\d', s)  # match only tries at the very start of the string
print(r)  # None
# ------------------------------------------------------------
r1 = re.match(r'\d', s1)  # s1 starts with a digit, so this succeeds
print(r1)  # <re.Match object; span=(0, 1), match='8'>
print(r1.span())  # (0, 1)
# ------------------------------------------------------------
r2 = re.search(r'\d', s)  # search scans the whole string for the first match
print(r2)  # <re.Match object; span=(3, 4), match='3'>
print(r2.group())  # 3
# ------------------------------------------------------------
r3 = re.findall(r'\d', s)  # findall returns every match as a list of strings
print(r3)  # ['3', '7', '2', '1', '5', '4', '5', '8']
# Capture groups and Match.group()
import re
# Extract the text that sits between 'life' and 'python' in s
s = 'life is short i, i use python, i love python'
# -------------------------------------------------
# Each (.*) is a capture group; group(0) is the whole match and
# group(1), group(2) are the captured pieces in order.
r = re.search('life(.*)python(.*)python', s)
print(r.group(0)) # life is short i, i use python, i love python
print(r.group(1)) # ' is short i, i use '
print(r.group(2)) # ', i love '
print(r.group(0,1,2)) # ('life is short i, i use python, i love python', ' is short i, i use ', ', i love ')
print(r.groups()) # (' is short i, i use ', ', i love ')
# -------------------------------------------------
# With more than one group, findall returns a list of tuples of the groups.
r = re.findall('life(.*)python(.*)python', s)
print(r) # [(' is short i, i use ', ', i love ')]