中文数字转换为阿拉伯数字

2019-11-20  本文已影响0人  momo1023
import regex as re

# Maps the Chinese numerals handled by the rewriting passes to ASCII digits.
# "两" is the colloquial form of 2, so it shares the target "2" with "二".
_HAN_DIGIT_TABLE = str.maketrans("零一二两三四五六七八九", "01223456789")

# A single non-zero digit, written either as a Chinese numeral or in ASCII.
_NONZERO_DIGIT = "[一二两三四五六七八九123456789]"


def _digit_value(char):
    """Return the integer value of one Chinese-numeral or ASCII digit."""
    return int(char.translate(_HAN_DIGIT_TABLE))


def _shorthand_rule(unit_char, unit_value, blocked_units):
    """Build (pattern, repl) for shorthand such as "一万二" (= 12000).

    In this shorthand the digit after the unit silently stands for the next
    lower unit.  ``blocked_units`` lists the unit characters that, when they
    follow the second digit, show the number is written out in full and must
    be left for the later passes.
    """
    pattern = re.compile(
        _NONZERO_DIGIT + unit_char + _NONZERO_DIGIT
        + "(?![" + blocked_units + "])"
    )

    def expand(match):
        # The match is always exactly three characters: digit, unit, digit.
        high, _, low = match.group()
        value = (_digit_value(high) * unit_value
                 + _digit_value(low) * (unit_value // 10))
        return str(value)

    return pattern, expand


def _magnitude_rule(head_pattern, unit_char, unit_value, max_tail_digits):
    """Build (pattern, repl) folding "<digits><unit><digits>" into one number."""
    pattern = re.compile(
        "%s%s[0-9]{0,%d}" % (head_pattern, unit_char, max_tail_digits)
    )

    def fold(match):
        head, _, tail = match.group().partition(unit_char)
        # An absent tail contributes nothing ("3百" -> 300).
        return str(int(head) * unit_value + int(tail or 0))

    return pattern, fold


def _fold_tens(match):
    """Turn "<d>十<d>" (either digit optional) into a two-digit number."""
    head, _, tail = match.group().partition("十")
    # A missing or zero multiplier means a bare "十", i.e. one ten.
    tens = int(head or 0) or 1
    return str(tens * 10 + int(tail or 0))


# Applied largest unit first, before any digit has been rewritten.
_SHORTHAND_RULES = (
    _shorthand_rule("万", 10000, "千百十"),
    _shorthand_rule("千", 1000, "百十"),
    _shorthand_rule("百", 100, "十"),
)

# "天"/"日"/"末" directly after "周" or "星期" denote the seventh weekday.
# The lookbehinds are spelled as fixed-width alternatives so the pattern
# compiles under the standard-library ``re`` as well as under ``regex``.
_WEEKDAY_SEVEN = re.compile("(?:(?<=周)|(?<=星期))[天日末]")

# A match may not START right after "周"/"星期": in "周5十点" the "5" belongs
# to the weekday, so only the bare "十" is folded (giving "周510点").
_TENS = re.compile("(?<!周)(?<!星期)0?[0-9]?十[0-9]?")

# Applied smallest unit first so each pass sees fully folded lower parts.
_MAGNITUDE_RULES = (
    _magnitude_rule("0?[1-9]", "百", 100, 2),
    _magnitude_rule("0?[1-9]", "千", 1000, 3),
    _magnitude_rule("[0-9]+", "万", 10000, 4),
    _magnitude_rule("[0-9]+", "亿", 100000000, 8),
)


def number_translator(target: str) -> str:
    """Rewrite numbers spelled with Chinese numerals as Arabic numerals.

    Every number found in ``target`` is rewritten in place; all other text is
    kept as it is.  For example ``"一千两百,六百零五"`` becomes
    ``"1200,605"``.  Some irregular spellings are handled too:

    * ``"两万零六百五"`` -> ``"20650"`` (trailing unit dropped)
    * ``"两百一十四"`` and ``"两百十四"`` -> ``"214"``
    * ``"一六零加一五八"`` -> ``"160加158"`` (digit-by-digit reading; only
      the numerals are rewritten, "加" stays)

    In addition "天", "日" and "末" directly after "周" or "星期" are
    rewritten to ``7`` (``"星期天"`` -> ``"星期7"``).

    The original author documented 0-99999999 as the range that converts
    correctly; a pass for "亿" is applied as well.

    The work is a fixed sequence of regular-expression passes.  Each pass
    replaces every match with the value computed from that same match, so
    adjacent numbers such as ``"十一十二"`` are converted independently
    (``"1112"``).

    :param target: text that may contain Chinese numerals
    :return: the text with those numerals rewritten as Arabic digits
    """
    # 1. Shorthand with an implied trailing unit ("一万二" -> "12000").
    for pattern, expand in _SHORTHAND_RULES:
        target = pattern.sub(expand, target)

    # 2. Remaining single numerals become single ASCII digits.
    target = target.translate(_HAN_DIGIT_TABLE)

    # 3. Weekday words for Sunday.
    target = _WEEKDAY_SEVEN.sub("7", target)

    # 4. Fold the units from "十" upwards.
    target = _TENS.sub(_fold_tens, target)
    for pattern, fold in _MAGNITUDE_RULES:
        target = pattern.sub(fold, target)

    return target

例子:

# Sample text mixing large numbers (亿/万/千/百), numbers containing 零,
# day counts and weekday words.
target = '二十亿一百万一千零两百,二千零三十三,三个星期.三天,三十日,星期天,周末, 十万亿'
print(number_translator(target=target))

输出:

2001001200,2033,3个星期.3天,30日,星期7,周7, 10000000000000
上一篇下一篇

猜你喜欢

热点阅读