import re
help(re.match)

Help on function match in module re:

match(pattern, string, flags=0)
    Try to apply the pattern at the start of the string, returning
    a Match object, or None if no match was found.

# 模式匹配字符串1的开头，并返回匹配对象。
# Inputs for the re.match demo: string1 starts with the pattern, while
# string2 only contains it later on (so re.match will not find it there).
pat = 'cat'
string1 = 'cat on mat'
string2 = 'raining cats and dogs'
re.match(pat, string1)

<re.Match object; span=(0, 3), match='cat'>

# 模式匹配字符串2，但不在开头，因此匹配失败并返回None。
re.match(pat, string2) is None

True

help(re.search)

Help on function search in module re:

search(pattern, string, flags=0)
    Scan through string looking for a match to the pattern, returning
    a Match object, or None if no match was found.

# 模式匹配字符串1的开头，并返回匹配对象。
pat = 'cat'
string1 = 'cat on mat'
string2 = 'raining cats and dogs'
string3 = 'abracadabra'
re.search(pat, string1)

<re.Match object; span=(0, 3), match='cat'>

# 模式匹配字符串2（不在开头！）并返回匹配对象。
re.search(pat, string2)

<re.Match object; span=(8, 11), match='cat'>

# 模式在字符串3中不匹配任何内容，返回None。
re.search(pat, string3) is None

True

help(re.findall)

Help on function findall in module re:

findall(pattern, string, flags=0)
    Return a list of all non-overlapping matches in the string.
    
    If one or more capturing groups are present in the pattern, return
    a list of groups; this will be a list of tuples if the pattern
    has more than one group.
    
    Empty matches are included in the result.

# 模式在字符串1中匹配一次，返回该匹配。
pat = 'cat'
string1 = 'cat on mat'
string2 = 'one cat, two cats, three cats'
string3 = 'abracadabra'
re.findall(pat, string1)

['cat']

# 模式在字符串2中匹配三次；返回三个实例的列表。
re.findall(pat, string2)

['cat', 'cat', 'cat']

# 模式在字符串3中不匹配任何内容，返回空列表。
re.findall(pat, string3)

[]

# 在 Python 中，反斜杠 (\) 在字符串字面量中用于转义特殊字符。例如，\n 表示换行符，\t 表示制表符等。
# 但是，在正则表达式中，反斜杠 (\) 也用于转义特殊字符。例如，\d 表示数字，\w 表示单词字符等。
# 问题在于，当你在 Python 字符串字面量中写正则表达式时，反斜杠 (\) 的含义会冲突。
# 例如，如果你想匹配一个字面上的反斜杠，你需要写 \\\\ 作为模式字符串，因为：
# 在正则表达式中，反斜杠 (\) 表示转义特殊字符，所以需要写 \\ 来表示一个字面上的反斜杠。
# 在 Python 字符串字面量中，反斜杠 (\) 也表示转义特殊字符，所以需要写 \\ 来表示一个字面上的反斜杠。
# 因此，为了表示一个字面上的反斜杠，你需要写 \\\\，这是因为反斜杠 (\) 的含义在 Python 字符串字面量和正则表达式中都需要被转义。
# Non-raw literals: each regex-level backslash must be doubled again for the
# Python string literal, so matching the two characters \$ takes six
# backslashes in the pattern and two in the subject string.
re.match('\\\\\\$', '\\$')

<re.Match object; span=(0, 2), match='\\$'>

# Raw strings pass backslashes through untouched, so the pattern only needs
# the regex-level escaping; the subject is raw too, to avoid an invalid
# "\$" escape sequence in a normal literal.
re.match(r'\\\$', r'\$')

<re.Match object; span=(0, 2), match='\\$'>

beatles = "hello\ngoodbye"
re.findall(r'\n', beatles)

['\n']

# “这很复杂，很难理解，所以强烈建议：除了最简单的表达式之外，一律使用原始字符串。”
re.findall('\\n', beatles)

['\n']

re.findall('\\\n', beatles)

['\n']

re.findall(r'$2', "2$2")

[]

re.findall(r'\$2', "2$2")

['$2']

# 示例：行的开头和结尾，通配符
# ‘.’匹配‘a’，并且开始和结束行正确匹配。
pat = r'^b.d$' 
re.findall(pat, 'bad')

['bad']

# ‘.’匹配‘i’，并且开始和结束行正确匹配。
re.findall(pat, 'bid')

['bid']

# 匹配失败是因为字符串末尾的‘s’，这意味着‘d’后面不是行尾。
re.findall(pat, 'bids')

[]

# 匹配失败是因为字符串开头的‘a’，这意味着‘b’不是字符串的开头。
re.findall(pat, 'abad')

[]

string1 = 'c\ta t\ns\n'
print(string1)

c	a t
s

# ‘\s’匹配任何空白。包括空格，制表符和新行。
re.findall(r'\s', string1)

['\t', ' ', '\n', '\n']

# ‘\b’匹配单词边界：第一个例子失败，因为‘hello’后面紧跟‘w’，不是单词边界；第二个例子中‘hello’后面是空格，构成单词边界。
print(re.findall(r'hello\b', 'helloworld!'))
print(re.findall(r'hello\b', "hello world!"))

[]
['hello']

# ‘\S’：匹配任何非空白
re.findall(r'\S', string1)

['c', 'a', 't', 's']

# ‘\D’：匹配任何非数字字符
re.findall(r'\D', string1)

['c', '\t', 'a', ' ', 't', '\n', 's', '\n']

# ‘\W’：匹配任何非单词字符
re.findall(r'\W', "abc123 \t\n_$*.")

[' ', '\t', '\n', '$', '*', '.']

# ‘\B’：匹配不在单词边界的
re.findall(r'\B\d\B', "1 2X a3 747")

['4']

print(re.findall(r'hello\B', 'helloworld!'))
print(re.findall(r'hello\B', "hello world!"))

['hello']
[]

# ‘*’：前一个项目的零个或多个
re.findall(r'ca*t', "ct cat caat caaat")

['ct', 'cat', 'caat', 'caaat']

# ‘+’：前一个项目的一或多个
re.findall(r'ca+t', "ct cat caat caaat")

['cat', 'caat', 'caaat']

# ‘?’：前一个项目的零个或一个
re.findall(r'ca?t', "ct cat caat caaat")

['ct', 'cat']

# ‘{2}’：恰好两个前一个项目
re.findall(r'ca{2}t', "ct cat caat caaat")

['caat']

# ‘{1,2}’：前一个项目的一个到两个（包括两端）
re.findall(r'ca{1,2}t', "ct cat caat caaat")

['cat', 'caat']

re.findall(r'^\d{2,4}\s', '7 a1'), re.findall(r'^\d{2,4}\s', '747 Boeing'), re.findall(r'^\d{2,4}\s', 'C7777 C7778')

([], ['747 '], [])

re.findall(r'^\d{2,4}\s', '12345 '), re.findall(r'^\d{2,4}\s', '1234\tqq'), re.findall(r'^\d{2,4}\s', 'Boeing 747')

([], ['1234\t'], [])

import re
print(re.findall(r'cat|dog', "cat"))
print(re.findall(r'cat|dog', "dog"))
print(re.findall(r'cat|dog', "cat\ndog"))

['cat']
['dog']
['cat', 'dog']

re.findall(r'a|aa|aaa', "aaaa")

['a', 'a', 'a', 'a']

# ‘a+’吞噬了整个字符串，因为Python正则表达式是贪婪的。
re.findall(r'a+', 'aaaaaa')

['aaaaaa']

# 在‘+’、‘*’等操作符后面加上‘?’会使其变为非贪婪，我们得到懒惰匹配，就像使用‘|’一样。
re.findall(r'a+?', 'aaaaaa')

['a', 'a', 'a', 'a', 'a', 'a']

# 示例：匹配电子邮件地址中的用户和域名
string1 = "My USC email is dpj@usc.edu.cn"
m = re.search(r'([\w.-]+)@([\w.-]+)', string1)
# ‘re.search’返回一个匹配对象。不带参数调用 group() 方法会返回被匹配的整个字符串。
m.group()

'dpj@usc.edu.cn'

# 可以按顺序访问组（正则表达式中的括号部分）的数字顺序。每组括号获得一个组，从左到右。
# re.findall具有类似的功能！
m.group(1)

'dpj'

m.group(2)

'usc.edu.cn'

m = re.search(r'(\S+) \1', 'cat cat')
m.group()

'cat cat'

m = re.search(r'(\S+) \1', 'cat dog')
m is None

True

re.search(r'dog', 'DOG', re.IGNORECASE)

<re.Match object; span=(0, 3), match='DOG'>

# regex = re.compile(r'cat|dog|bird', re.DEBUG)
regex = re.compile(r'cat|dog|bird')
regex.findall("It's raining cats and dogs")

['cat', 'dog']

regex.match("cat bird dog")

<re.Match object; span=(0, 3), match='cat'>

regex.search("nothing to see here.") is None

True

import re
# 原始日期字符串
date_string = "2024-05-01\n2024/05/02\n2024.05.03\n2024/05/04\n2024.05.05"
# 正则表达式匹配 YYYY-MM-DD, YYYY/MM/DD, YYYY.MM.DD 格式
pattern = r'(\d{4})[-/\.](\d{2})[-/\.](\d{2})'
# 替换为 MM-DD-YYYY 格式
formatted_dates = re.sub(pattern, r'\2-\3-\1', date_string)
print(formatted_dates)

05-01-2024
05-02-2024
05-03-2024
05-04-2024
05-05-2024

* 原始文本
    * "联系我：123-456-7890 或者 987.654.3210."
* 替换格式：将电话号码格式化为 (XXX) XXX-XXXX

import re
# 原始文本
text = "联系我：123-456-7890 或者 987.654.3210."
# 正则表达式匹配电话号码
pattern = r'(\d{3})[-.](\d{3})[-.](\d{4})'
# 替换格式：将电话号码格式化为 (XXX) XXX-XXXX
formatted_text = re.sub(pattern, r'(\1) \2-\3', text)
print(formatted_text)

联系我：(123) 456-7890 或者 (987) 654-3210.

Python数据处理¶

06. 文本编码和正则表达式¶

结构化数据¶

文本数据无处不在¶

文本数据是如何存储的？¶

文本数据是如何存储的？¶

ASCII（美国信息交换标准代码）¶

ASCII 表格¶

注意！¶

Unicode¶

匹配文本：正则表达式（“regexes”）¶

Python中的正则表达式：re包¶

那么更复杂的匹配呢？¶

天哪，那真是很多反斜杠...¶

回想一下¶

特殊字符：基础¶

特殊字符：集合和范围¶

特殊字符：单字符匹配¶

示例：空白和边界¶

字符类别：补集¶

匹配和重复¶

测试你的理解¶

或条款：`|`¶

或条款：`|` 是懒惰的！¶

匹配和贪婪¶

提取组¶

后引用¶

后引用¶

后引用¶

Python re模块提供的选项¶

调试¶

捕获替换¶

Python数据处理¶

06. 文本编码和正则表达式¶

结构化数据¶

文本数据无处不在¶

文本数据是如何存储的？¶

文本数据是如何存储的？¶

ASCII（美国信息交换标准代码）¶

ASCII 表格¶

注意！¶

Unicode¶

匹配文本：正则表达式（“regexes”）¶

Python中的正则表达式：re包¶

那么更复杂的匹配呢？¶

天哪，那真是很多反斜杠...¶

回想一下¶

特殊字符：基础¶

特殊字符：集合和范围¶

特殊字符：单字符匹配¶

示例：空白和边界¶

字符类别：补集¶

匹配和重复¶

测试你的理解¶

或条款：|¶

或条款：| 是懒惰的！¶

匹配和贪婪¶

提取组¶

后引用¶

后引用¶

后引用¶

Python re模块提供的选项¶

调试¶

捕获替换¶

或条款：`|`¶

或条款：`|` 是懒惰的！¶