正则表达式

# 链接资料

https://docs.python.org/zh-cn/3/library/re.html
https://www.runoob.com/python/python-reg-expressions.html

# 概念

正则表达式是一个特殊的字符序列，它能帮助你方便地检查一个字符串是否与某种模式匹配。

# 基本方法

正则表达式：
re模块
import re

 re模块：
    	re.compile（编译）
      re.match()  从开头匹配一次
      re.search()  只匹配一次
      re.findall()  查找所有
      re.sub(正则表达式, "新内容（可以函数）", str)
      split() result = re.split(r'[,:]',"a:100,b:100")

基础:
. 任意字符
[] 范围
| 或者
() 一组

量词：
* >=0
+ >=1
? 0,1
{m} =m
{m,} >=m
{m,n} >=m <=n

预定义：
\s 匹配任意空白字符
\S 匹配任意非空白字符
\d 数字
\D 非数字
\w 单词字符 [0-9A-Za-z_]（Unicode 模式下还包括汉字等文字字符）
\W 非单词字符
\b 单词边界
\B 非单词边界

分组：
 分组:() ---> result.group(1) 获取组中匹配内容
    不需要引用分组的内容：
        result = re.match(r"<[0-9a-zA-Z]+>(.+)</[0-9a-zA-Z]+>", msg)
    引用分组匹配内容：
        1.number方式
            result2 = re.match(r"<([0-9a-zA-Z]+)><([0-9a-zA-Z]+)>(.*)</\2></\1>$", msg1)
        2.?P<名字>
            result = re.match(r'<(?P<name1>\w+)><(?P<name2>\w+)>(.+)</(?P=name2)></(?P=name1)>$', msg1)
    

Python里数量词默认是贪婪的(少数语言是非贪婪的)，匹配更多的字符
非贪婪则相反，总是尝试匹配更少的字符。

在* ? + {m,n}后面加上?,变成非贪婪

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51

# 正则练习

# match,search，只匹配一次

# match() only tries at position 0; search() scans forward for the first hit.
msg = "一二三四五六七八一二四五"

# Anchored at the start: yields None when msg does not begin with "一二".
result = re.compile("一二").match(msg)
print(result)

# First occurrence anywhere in msg; later occurrences are not reported.
result = re.compile("四五").search(msg)
print(result)

# group() is the matched text, span() its (start, end) offsets.
matched_text, offsets = result.group(), result.span()
print(matched_text, offsets)
# Expected output:
# <re.Match object; span=(0, 2), match='一二'>
# <re.Match object; span=(3, 5), match='四五'>
# 四五 (3, 5)

1
2
3
4
5
6
7
8
9
10
11
12
13
14

# 找 y2a 字母数字字母

# Find letter-digit-letter sequences such as "y2a".
import re

msg = "f8re0ds9fd8er9sd"
pattern = "[a-z][0-9][a-z]"

# match() only tries position 0 of msg and yields at most one hit.
match_hit = re.match(pattern, msg)
print("match:", match_hit.group())

# search() scans msg from left to right and stops at the first hit.
search_hit = re.search(pattern, msg)
print("search:", search_hit.group())

# findall() collects every non-overlapping hit into a list.
result = re.findall(pattern, msg)
print("findall:", result)

# Expected output:
# match: f8r
# search: f8r
# findall: ['f8r', 'e0d', 's9f', 'd8e', 'r9s']

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18

# 匹配a1a a23a a2323a 开头结尾是字母，中间是数字

# Match strings such as a1a, a23a, a2323a: a letter at each end with one or
# more digits in between.
import re

msg = "ad9sd98sd0909sdf"
# [0-9]+ means the digit class repeats one or more times.
letter_digits_letter = re.compile("[a-z][0-9]+[a-z]")
result = [hit.group() for hit in letter_digits_letter.finditer(msg)]
print("+(表示1次到多次):", result)

# Expected output:
# +(表示1次到多次): ['d9s', 'd98s', 'd0909s']

1
2
3
4
5
6
7
8
9
10

# qq号码验证 5-11位

# Validate a QQ number: 5 to 11 digits, the first of which is not 0.
import re

s = "12346587"
# {m,n} bounds the repetition count: at least m, at most n.
qq_pattern = re.compile("^[1-9][0-9]{4,10}$")
result = qq_pattern.match(s)
print("是QQ号码", result.group())

# Expected output:
# 是QQ号码 12346587

1
2
3
4
5
6
7
8
9
10

# 用户名可以是字母或者数字，不能是数字开头，用户名长度必须6位以上

# Example: a username may contain letters and digits, must not start with a
# digit, and must be at least 6 characters long.
import re

username = "fe23asd"
# The first class must be [a-zA-Z]: the range A-z would also admit the
# punctuation that sits between "Z" and "a" in ASCII ([ \ ] ^ _ `).
# re.match() already anchors at the start; the trailing $ anchors the end.
pattern = "[a-zA-Z][a-zA-Z0-9]{5,}$"
result = re.match(pattern, username)
print("可正确的用户法：", result.group())

# Expected output:
# 可正确的用户法： fe23asd

1
2
3
4
5
6
7
8
9
10

# 文件名的后缀为.py的文件

# Find file names ending in the .py suffix.
import re

msg = "a.py a.txt kk.py a*py asd.pyfd"
# In a normal string literal each regex backslash is written as two.
result = re.findall("\\w+\\.py\\b", msg)
# A raw string passes backslashes through unchanged, so none are doubled.
result1 = re.findall(r"\w+\.py\b", msg)
print(result, result1)

# Expected output:
# ['a.py', 'kk.py'] ['a.py', 'kk.py']

1
2
3
4
5
6
7
8
9
10
11

# 匹配数字0-100

# Match an integer from 0 to 100.
import re

n = "56"
# [1-9]?\d$ covers 0-99: an optional non-zero tens digit, then one mandatory
# digit. The digit must not be optional, otherwise the empty string matches.
# |100$ adds the single three-digit value.
pattern = r'[1-9]?\d$|100$'
result = re.match(pattern, n)
print(result.group())  # 56

1
2
3
4

验证邮箱

# Validate an e-mail address hosted at qq, 163 or 126.
# (qq|163|126) is an alternation: exactly one of the three alternatives.
# [abc123], by contrast, is a character class: any single one of a, b, c, 1, 2, 3.
import re

email = "1986458@163.com"
# \w{5,20} is the local part; \. is a literal dot before the com/cn suffix.
result = re.match(r'\w{5,20}@(qq|163|126)\.(com|cn)$', email)
print(result)

# Expected output:
# <re.Match object; span=(0, 15), match='1986458@163.com'>

1
2
3
4
5
6
7
8
9
10
11
12

不是以4，7结尾的手机号码（11位)

# An 11-digit mobile number that does not end in 4 or 7.
import re

phone = "12462686539"
# 1, then nine digits, then a final digit other than 4 or 7. The $ anchor
# rejects inputs that carry extra characters after the eleventh digit.
pattern = r"1\d{9}[0-35689]$"
result = re.match(pattern, phone)
print(result)

# Expected output:
# <re.Match object; span=(0, 11), match='12462686539'>

1
2
3
4
5
6

区号

# Landline number: area code, a hyphen, then an eight-digit number.
import re

msg = "010-12345678"
# The area code is three or four digits; \d (not \w) keeps letters and
# underscores out of it.
result = re.match(r"(\d{3}|\d{4})-(\d{8}$)", msg)
# Parentheses create groups: group(1) is the first pair (\d{3}|\d{4}),
# group(2) the second pair (\d{8}$); group() with no argument is the whole match.
print(result.group())  # 010-12345678
print(result.group(1))  # 010
print(result.group(2))  # 12345678

# Expected output:
# 010-12345678
# 010
# 12345678

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15

匹配网址

# Match paired HTML tags.
# Example 1: the opening and closing tag names are matched independently.
import re

msg = '<html>abc</html>'
result = re.match(r"<[0-9a-zA-Z]+>(.+)</[0-9a-zA-Z]+>", msg)
print(result.group(0))  # <html>abc</html>
print(result.group(1))  # abc

# Example 2: \1 must repeat exactly what group 1, ([0-9a-zA-Z]+), captured.
result1 = re.match(r"<([0-9a-zA-Z]+)>(.+)</\1>$", msg)
print(result1.group(1))  # html
print(result1.group(2))  # abc

# Example 3: nested tags; each backreference must pair with its own group,
# so the inner tag (\2) closes before the outer tag (\1).
msg1 = "<html><h1>abcdef</h1></html>"
result2 = re.match(r"<([0-9a-zA-Z]+)><([0-9a-zA-Z]+)>(.*)</\2></\1>$", msg1)
print(result2.group(0))
print(result2.group(1))
print(result2.group(2))
print(result2.group(3))

# Expected output of example 3:
# <html><h1>abcdef</h1></html>
# html
# h1
# abcdef

# The same pattern with named groups: define a group with (?P<name>regex)
# and refer back to it with (?P=name).
result = re.match(r'<(?P<name1>\w+)><(?P<name2>\w+)>(.+)</(?P=name2)></(?P=name1)>$', msg1)
print(result.group())  # <html><h1>abcdef</h1></html>

def func(temp):
    """Return the matched number plus one, as a string.

    temp is the match object that re.sub() hands to a callable replacement;
    its whole matched text is parsed with int(), incremented, and converted
    back to str so it can be spliced into the result.
    """
    num = temp.group()
    num1 = int(num) + 1
    return str(num1)


# The replacement may be a function (called for each match) or a plain string.
result1 = re.sub(r'\d+', func, "a:100,b:100")
result = re.sub(r'\d+', "90", "a:100,b:100")
print(result, result1)  # a:90,b:90 a:101,b:101

# Split on either a comma or a colon.
result = re.split(r'[,:]', "a:100,b:100")
print(result)  # ['a', '100', 'b', '100']

# Greedy versus non-greedy quantifiers.
msg = "abc123abc"
greedy = re.match(r"abc(\d+)", msg)  # \d+ takes every digit available: match='abc123'
print("默认是贪婪的：", greedy)  # 默认是贪婪的： <re.Match object; span=(0, 6), match='abc123'>

result = re.match(r"abc(\d+?)", msg)  # \d+? stops after a single digit: match='abc1'
print("加问号变成非贪婪的", result)  # 加问号变成非贪婪的 <re.Match object; span=(0, 4), match='abc1'>

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51

# 总结

 re模块：
        re.match()  从开头匹配，匹配一次就结束
        re.search()  只匹配一次就结束
        re.findall()  查找所有
        re.finditer() 在字符串中找到正则表达式所匹配的所有子串，并把它们作为一个迭代器返回。
        re.sub(正则表达式, "新内容（可以函数）", str)

    +  1次到无数次
    *  0次到无数次
    ?  0次或1次（加在其他量词后面时表示非贪婪）

    .  任意字符（默认不包括换行符）
    [] 范围    [0-9]
    |  或者    a|b
    () 一组    group()分组	起名的方式： (?P<名字>正则)   引用：(?P=名字)
    {2} 恰好2次 {2,6} 2次到6次 {2,} 2次到无数次
    ^	开头
    $	结尾

    原生字符串r

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20

编辑

上次更新: 2023/05/17, 23:08:21

← hashlib 01.time→