Regex实践

2023-11-01  本文已影响0人  MrDecoder

[TOC]

#1. 电话号码

文本:

Pattern:

#include <Windows.h>
#include <regex>
#include <iostream>
#include <string>

// Demo: locate every phone number (e.g. "248-555-1234" or "(313) 555-1234")
// in a sample string. For each hit, print the text before it, every capture
// group (index 0 is the whole match) and the text after it.
int main()
{
    const std::string text = "J. Doe: 248-555-1234  B. Smith: (313) 555-1234";
    const std::regex expression("\\(?[2-9]\\d\\d\\)?[ -]?[2-9]\\d\\d-\\d{4}");

    std::smatch hit;
    // Every successful search resumes right behind the previous match.
    for (auto cursor = text.cbegin();
         std::regex_search(cursor, text.cend(), hit, expression);
         cursor = hit.suffix().first)
    {
        std::cout << "matches for '" << text << "'\n"
                  << "Prefix: '" << hit.prefix() << "'\n";

        std::size_t groupIndex = 0;
        for (const auto& group : hit)
        {
            std::cout << groupIndex << ": " << group << '\n';
            ++groupIndex;
        }

        std::cout << "Suffix: '" << hit.suffix() << "\'\n\n";
    }

    return 0;
}

// matches for 'J. Doe: 248-555-1234  B. Smith: (313) 555-1234'
// Prefix: 'J. Doe: '
// 0: 248-555-1234
// Suffix: '  B. Smith: (313) 555-1234'

// matches for 'J. Doe: 248-555-1234  B. Smith: (313) 555-1234'
// Prefix: '  B. Smith: '
// 0: (313) 555-1234
// Suffix: ''

\(?匹配一个可选的左括号。接下来的[2-9]\d\d负责匹配一个3位数的区号(第1位数字只能是2到9)。\)?匹配一个可选的右括号。[ -]?匹配一个空格或连字符——这个字符也是可选的。[2-9]\d\d-\d{4}匹配电话号码的剩余部分:一个3位数的局号(第1位数字只能是2到9)、一个连字符和最后4位数字。

#2. 邮政编码

文本:

Pattern:

#include <Windows.h>
#include <regex>
#include <iostream>
#include <string>

// Demo: find every ZIP code (five digits, optionally followed by "-dddd")
// in a sample address string and dump prefix, capture groups and suffix
// for each match.
int main()
{
    const std::string text = "999 1st Avenue, Bigtown, NY, 11222 123 High Street, Any City, MI 48034-1234";
    const std::regex expression("\\d{5}(-\\d{4})?");

    const auto textEnd = text.cend();
    auto resumeAt = text.cbegin();
    std::smatch found;

    while (std::regex_search(resumeAt, textEnd, found, expression))
    {
        std::cout << "matches for '" << text << "'\n";
        std::cout << "Prefix: '" << found.prefix() << "'\n";

        // Group 0 is the full match; group 1 is the optional "-dddd" part.
        const std::size_t groupCount = found.size();
        for (std::size_t idx = 0; idx != groupCount; ++idx)
        {
            std::cout << idx << ": " << found[idx] << '\n';
        }

        std::cout << "Suffix: '" << found.suffix() << "\'\n\n";

        // Continue scanning right after the text just matched.
        resumeAt = found.suffix().first;
    }

    return 0;
}

// matches for '999 1st Avenue, Bigtown, NY, 11222 123 High Street, Any City, MI 48034-1234'
// Prefix: '999 1st Avenue, Bigtown, NY, '
// 0: 11222
// 1:
// Suffix: ' 123 High Street, Any City, MI 48034-1234'

// matches for '999 1st Avenue, Bigtown, NY, 11222 123 High Street, Any City, MI 48034-1234'
// Prefix: ' 123 High Street, Any City, MI '
// 0: 48034-1234
// 1: -1234
// Suffix: ''

\d{5}匹配任意5位数字,(-\d{4})?匹配一个连字符和后4位数字。因为后4位数字是可选的,所以要把-\d{4}用括号括起来(使它成为一个子表达式),再通过?来表明这个子表达式最多只允许出现一次。

#3. IP地址

文本:

Pattern:

#include <Windows.h>
#include <regex>
#include <iostream>
#include <string>

// Demo: find every dotted-quad IPv4 address in a sample sentence and print,
// for each match, the preceding text, all capture groups and the trailing text.
int main()
{
    using namespace std;

    string text = "localhost is 127.0.0.1.";

    // Each octet is one of: 0-99 | 100-199 | 200-249 | 250-255.
    //
    // ECMAScript alternation is ordered: the first alternative that lets the
    // overall match succeed wins, not the longest one. Without anchoring, the
    // final octet would stop at \d{1,2}, so "192.168.1.100" would be reported
    // as "192.168.1.10", and "56.1.1.1" would be found inside the illegal
    // "256.1.1.1". The surrounding \b assertions force the address to start
    // and end on a whole number, which makes the engine backtrack into the
    // three-digit alternatives when needed. Capture-group numbering is
    // unchanged.
    regex expression("\\b(((\\d{1,2})|(1\\d{2})|(2[0-4]\\d)|(25[0-5]))\\.){3}((\\d{1,2})|(1\\d{2})|(2[0-4]\\d)|(25[0-5]))\\b");

    smatch matches;
    string::const_iterator searchStart(text.cbegin());
    while (regex_search(searchStart, text.cend(), matches, expression))
    {
        cout << "matches for '" << text << "'\n";
        cout << "Prefix: '" << matches.prefix() << "'\n";
        // Index 0 is the whole address; 1..11 are the nested octet groups.
        for (size_t i = 0; i < matches.size(); ++i)
        {
            cout << i << ": " << matches[i] << '\n';
        }
        cout << "Suffix: '" << matches.suffix() << "\'\n\n";

        // Resume right after this match. A match always ends on a digit that
        // is followed by a non-word character (or the end of the text), so the
        // leading \b cannot fire spuriously at the new start position.
        searchStart = matches.suffix().first;
    }

    return 0;
}

// matches for 'localhost is 127.0.0.1.'
// Prefix: 'localhost is '
// 0: 127.0.0.1
// 1: 0.
// 2: 0
// 3: 0
// 4:
// 5:
// 6:
// 7: 1
// 8: 1
// 9:
// 10:
// 11:
// Suffix: '.'

这个模式使用了一系列嵌套子表达式。先来看由4个子表达式构成的(((\d{1,2})|(1\d{2})|(2[0-4]\d)|(25[0-5]))\.):(\d{1,2})匹配任意一位或两位数字(0~99);(1\d{2})匹配以1开头的任意三位数字(100~199);(2[0-4]\d)匹配整数200~249;(25[0-5])匹配整数250~255。这几个子表达式通过|操作符结合为一个更大的子表达式(其含义是只须匹配这4个子表达式之一即可)。随后的\.用来匹配.字符,它与前4个子表达式构成的子表达式又构成了一个更大的子表达式,而接下来的{3}表明需要重复3次。最后,数值范围又重复了一次(这次省略了尾部的\.)以匹配IP地址里的最后一组数字。通过把4组以.分隔的数字的取值范围都限制在0~255之间,这个模式准确无误地做到了只匹配合法的IP地址,但不匹配非法的IP地址。

#4. URL地址

文本:

Pattern:

#include <Windows.h>
#include <regex>
#include <iostream>
#include <string>

// Demo: extract every http/https URL from a sample string and print, for each
// match, the preceding text, all capture groups and the trailing text.
//
// Capture groups of the pattern:
//   1: optional "user:password@" credentials
//   2: optional ":port"
//   3: optional path starting with "/"
//   4: the path without its leading "/"
//   5: optional "?query" string
int main()
{
    using namespace std;

    // The sample includes a URL with embedded credentials so that group 1
    // (the "user:password@" part) is actually exercised by the demo.
    string text = "http://www.forta.com/blog https://www.forta.com:80/blog/index.cfm http://www.forta.com http://ben:password@www.forta.com/ http://localhost:8500/";
    regex expression("https?://(\\w*:\\w*@)?[-\\w.]+(:\\d+)?(/([\\w/_.]*(\\?\\S+)?)?)?");

    smatch matches;
    string::const_iterator searchStart(text.cbegin());
    while (regex_search(searchStart, text.cend(), matches, expression))
    {
        cout << "matches for '" << text << "'\n";
        cout << "Prefix: '" << matches.prefix() << "'\n";
        for (size_t i = 0; i < matches.size(); ++i)
        {
            cout << i << ": " << matches[i] << '\n';
        }
        cout << "Suffix: '" << matches.suffix() << "\'\n\n";
        // Continue scanning right after the URL just reported.
        searchStart = matches.suffix().first;
    }

    return 0;
}

// Expected output (the long "matches for" line is wrapped here for readability):

// matches for 'http://www.forta.com/blog https://www.forta.com:80/blog/index.cfm http://www.forta.com
// http://ben:password@www.forta.com/ http://localhost:8500/'
// Prefix: ''
// 0: http://www.forta.com/blog
// 1:
// 2:
// 3: /blog
// 4: blog
// 5:
// Suffix: ' https://www.forta.com:80/blog/index.cfm http://www.forta.com http://ben:password@www.forta.com/ http://localhost:8500/'

// matches for 'http://www.forta.com/blog https://www.forta.com:80/blog/index.cfm http://www.forta.com
// http://ben:password@www.forta.com/ http://localhost:8500/'
// Prefix: ' '
// 0: https://www.forta.com:80/blog/index.cfm
// 1:
// 2: :80
// 3: /blog/index.cfm
// 4: blog/index.cfm
// 5:
// Suffix: ' http://www.forta.com http://ben:password@www.forta.com/ http://localhost:8500/'

// matches for 'http://www.forta.com/blog https://www.forta.com:80/blog/index.cfm http://www.forta.com
// http://ben:password@www.forta.com/ http://localhost:8500/'
// Prefix: ' '
// 0: http://www.forta.com
// 1:
// 2:
// 3:
// 4:
// 5:
// Suffix: ' http://ben:password@www.forta.com/ http://localhost:8500/'

// matches for 'http://www.forta.com/blog https://www.forta.com:80/blog/index.cfm http://www.forta.com
// http://ben:password@www.forta.com/ http://localhost:8500/'
// Prefix: ' '
// 0: http://ben:password@www.forta.com/
// 1: ben:password@
// 2:
// 3: /
// 4:
// 5:
// Suffix: ' http://localhost:8500/'

// matches for 'http://www.forta.com/blog https://www.forta.com:80/blog/index.cfm http://www.forta.com
// http://ben:password@www.forta.com/ http://localhost:8500/'
// Prefix: ' '
// 0: http://localhost:8500/
// 1:
// 2: :8500
// 3: /
// 4:
// 5:
// Suffix: ''

https?://后面的是(\w*:\w*@)?,它将匹配嵌在URL字符串里的用户名和口令字(用户名和口令字要用:隔开,它们的后面还跟着一个@字符)。子表达式(\?\S+)?负责匹配查询字符串。查询字符串是在URL字符串里出现在?后面的文本,这些文本是可选的。

#5. 电子邮件地址

文本:

Pattern:

#include <Windows.h>
#include <regex>
#include <iostream>
#include <string>

// Demo: pick the e-mail address out of a sample sentence and print the text
// before it, each capture group (index 0 is the full address) and the text
// after it.
int main()
{
    const std::string text = "My name is Ben Forta, and my email address is ben@forta.com.";
    const std::regex expression("(\\w+\\.)*\\w+@(\\w+\\.)+[A-Za-z]+");

    std::smatch result;
    auto from = text.cbegin();
    const auto to = text.cend();

    for (; std::regex_search(from, to, result, expression); from = result.suffix().first)
    {
        std::cout << "matches for '" << text << "'\n";
        std::cout << "Prefix: '" << result.prefix() << "'\n";

        // Walk the sub-matches in order, numbering them from 0.
        std::size_t number = 0;
        for (auto sub = result.begin(); sub != result.end(); ++sub, ++number)
        {
            std::cout << number << ": " << *sub << '\n';
        }

        std::cout << "Suffix: '" << result.suffix() << "\'\n\n";
    }

    return 0;
}

// matches for 'My name is Ben Forta, and my email address is ben@forta.com.'
// Prefix: 'My name is Ben Forta, and my email address is '
// 0: ben@forta.com
// 1:
// 2: forta.
// Suffix: '.'

(\w+\.)*\w+负责匹配电子邮件地址里的用户名部分(@之前的所有文本):(\w+\.)*匹配一些由.结束的文本的零次或多次重复出现,\w+匹配必不可少的文本(这个组合将匹配ben和ben.forta等)。接下来,@匹配@字符本身,(\w+\.)+匹配至少一个以.结束的字符串,[A-Za-z]+匹配顶级域名(com、edu、us或uk,等等)。

上一篇下一篇

猜你喜欢

热点阅读