python - 有没有办法将数字单词转换为整数？

Question

我需要转换one成1, twointo2等等。

有没有办法用一个库或一个类或任何东西来做到这一点？

score 137 · Accepted Answer

这段代码的大部分是设置 numwords 字典，这仅在第一次调用时完成。

def text2int(textnum, numwords={}):
    if not numwords:
      units = [
        "zero", "one", "two", "three", "four", "five", "six", "seven", "eight",
        "nine", "ten", "eleven", "twelve", "thirteen", "fourteen", "fifteen",
        "sixteen", "seventeen", "eighteen", "nineteen",
      ]

      tens = ["", "", "twenty", "thirty", "forty", "fifty", "sixty", "seventy", "eighty", "ninety"]

      scales = ["hundred", "thousand", "million", "billion", "trillion"]

      numwords["and"] = (1, 0)
      for idx, word in enumerate(units):    numwords[word] = (1, idx)
      for idx, word in enumerate(tens):     numwords[word] = (1, idx * 10)
      for idx, word in enumerate(scales):   numwords[word] = (10 ** (idx * 3 or 2), 0)

    current = result = 0
    for word in textnum.split():
        if word not in numwords:
          raise Exception("Illegal word: " + word)

        scale, increment = numwords[word]
        current = current * scale + increment
        if scale > 100:
            result += current
            current = 0

    return result + current

print text2int("seven billion one hundred million thirty one thousand three hundred thirty seven")
#7100031337

score 33 · Accepted Answer

我刚刚为 PyPI 发布了一个名为word2number的 python 模块，用于确切目的。https://github.com/akshaynagpal/w2n

安装它使用：

pip install word2number

确保您的 pip 已更新到最新版本。

用法：

from word2number import w2n

print w2n.word_to_num("two million three thousand nine hundred and eighty four")
2003984

score 16 · Accepted Answer

如果有人感兴趣，我破解了一个维护字符串其余部分的版本（尽管它可能有错误，但没有对其进行过多测试）。

def text2int (textnum, numwords={}):
    if not numwords:
        units = [
        "zero", "one", "two", "three", "four", "five", "six", "seven", "eight",
        "nine", "ten", "eleven", "twelve", "thirteen", "fourteen", "fifteen",
        "sixteen", "seventeen", "eighteen", "nineteen",
        ]

        tens = ["", "", "twenty", "thirty", "forty", "fifty", "sixty", "seventy", "eighty", "ninety"]

        scales = ["hundred", "thousand", "million", "billion", "trillion"]

        numwords["and"] = (1, 0)
        for idx, word in enumerate(units):  numwords[word] = (1, idx)
        for idx, word in enumerate(tens):       numwords[word] = (1, idx * 10)
        for idx, word in enumerate(scales): numwords[word] = (10 ** (idx * 3 or 2), 0)

    ordinal_words = {'first':1, 'second':2, 'third':3, 'fifth':5, 'eighth':8, 'ninth':9, 'twelfth':12}
    ordinal_endings = [('ieth', 'y'), ('th', '')]

    textnum = textnum.replace('-', ' ')

    current = result = 0
    curstring = ""
    onnumber = False
    for word in textnum.split():
        if word in ordinal_words:
            scale, increment = (1, ordinal_words[word])
            current = current * scale + increment
            if scale > 100:
                result += current
                current = 0
            onnumber = True
        else:
            for ending, replacement in ordinal_endings:
                if word.endswith(ending):
                    word = "%s%s" % (word[:-len(ending)], replacement)

            if word not in numwords:
                if onnumber:
                    curstring += repr(result + current) + " "
                curstring += word + " "
                result = current = 0
                onnumber = False
            else:
                scale, increment = numwords[word]

                current = current * scale + increment
                if scale > 100:
                    result += current
                    current = 0
                onnumber = True

    if onnumber:
        curstring += repr(result + current)

    return curstring

例子：

 >>> text2int("I want fifty five hot dogs for two hundred dollars.")
 I want 55 hot dogs for 200 dollars.

如果您有“200 美元”，则可能会出现问题。但是，这真的很粗糙。

score 15 · Accepted Answer

我需要一些不同的东西，因为我的输入来自语音到文本的转换，而解决方案并不总是对数字求和。例如，“我的邮政编码是一二三四五”不应转换为“我的邮政编码是 15”。

我接受了 Andrew 的回答并对其进行了调整，以处理其他一些人们突出显示为错误的情况，并且还添加了对我上面提到的邮政编码等示例的支持。一些基本的测试用例如下所示，但我相信仍有改进的空间。

def is_number(x):
    if type(x) == str:
        x = x.replace(',', '')
    try:
        float(x)
    except:
        return False
    return True

def text2int (textnum, numwords={}):
    units = [
        'zero', 'one', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight',
        'nine', 'ten', 'eleven', 'twelve', 'thirteen', 'fourteen', 'fifteen',
        'sixteen', 'seventeen', 'eighteen', 'nineteen',
    ]
    tens = ['', '', 'twenty', 'thirty', 'forty', 'fifty', 'sixty', 'seventy', 'eighty', 'ninety']
    scales = ['hundred', 'thousand', 'million', 'billion', 'trillion']
    ordinal_words = {'first':1, 'second':2, 'third':3, 'fifth':5, 'eighth':8, 'ninth':9, 'twelfth':12}
    ordinal_endings = [('ieth', 'y'), ('th', '')]

    if not numwords:
        numwords['and'] = (1, 0)
        for idx, word in enumerate(units): numwords[word] = (1, idx)
        for idx, word in enumerate(tens): numwords[word] = (1, idx * 10)
        for idx, word in enumerate(scales): numwords[word] = (10 ** (idx * 3 or 2), 0)

    textnum = textnum.replace('-', ' ')

    current = result = 0
    curstring = ''
    onnumber = False
    lastunit = False
    lastscale = False

    def is_numword(x):
        if is_number(x):
            return True
        if word in numwords:
            return True
        return False

    def from_numword(x):
        if is_number(x):
            scale = 0
            increment = int(x.replace(',', ''))
            return scale, increment
        return numwords[x]

    for word in textnum.split():
        if word in ordinal_words:
            scale, increment = (1, ordinal_words[word])
            current = current * scale + increment
            if scale > 100:
                result += current
                current = 0
            onnumber = True
            lastunit = False
            lastscale = False
        else:
            for ending, replacement in ordinal_endings:
                if word.endswith(ending):
                    word = "%s%s" % (word[:-len(ending)], replacement)

            if (not is_numword(word)) or (word == 'and' and not lastscale):
                if onnumber:
                    # Flush the current number we are building
                    curstring += repr(result + current) + " "
                curstring += word + " "
                result = current = 0
                onnumber = False
                lastunit = False
                lastscale = False
            else:
                scale, increment = from_numword(word)
                onnumber = True

                if lastunit and (word not in scales):                                                                                                                                                                                                                                         
                    # Assume this is part of a string of individual numbers to                                                                                                                                                                                                                
                    # be flushed, such as a zipcode "one two three four five"                                                                                                                                                                                                                 
                    curstring += repr(result + current)                                                                                                                                                                                                                                       
                    result = current = 0                                                                                                                                                                                                                                                      

                if scale > 1:                                                                                                                                                                                                                                                                 
                    current = max(1, current)                                                                                                                                                                                                                                                 

                current = current * scale + increment                                                                                                                                                                                                                                         
                if scale > 100:                                                                                                                                                                                                                                                               
                    result += current                                                                                                                                                                                                                                                         
                    current = 0                                                                                                                                                                                                                                                               

                lastscale = False                                                                                                                                                                                                              
                lastunit = False                                                                                                                                                
                if word in scales:                                                                                                                                                                                                             
                    lastscale = True                                                                                                                                                                                                         
                elif word in units:                                                                                                                                                                                                             
                    lastunit = True

    if onnumber:
        curstring += repr(result + current)

    return curstring

一些测试...

one two three -> 123
three forty five -> 345
three and forty five -> 3 and 45
three hundred and forty five -> 345
three hundred -> 300
twenty five hundred -> 2500
three thousand and six -> 3006
three thousand six -> 3006
nineteenth -> 19
twentieth -> 20
first -> 1
my zip is one two three four five -> my zip is 12345
nineteen ninety six -> 1996
fifty-seventh -> 57
one million -> 1000000
first hundred -> 100
I will buy the first thousand -> I will buy the 1000  # probably should leave ordinal in the string
thousand -> 1000
hundred and six -> 106
1 million -> 1000000

score 12 · Accepted Answer

我需要处理一些额外的解析案例，例如序数词（“first”、“second”）、连字符（“one-hundred”）和连字符的序词（如“fifty-7”），所以我添加了几行：

def text2int(textnum, numwords={}):
    if not numwords:
        units = [
        "zero", "one", "two", "three", "four", "five", "six", "seven", "eight",
        "nine", "ten", "eleven", "twelve", "thirteen", "fourteen", "fifteen",
        "sixteen", "seventeen", "eighteen", "nineteen",
        ]

        tens = ["", "", "twenty", "thirty", "forty", "fifty", "sixty", "seventy", "eighty", "ninety"]

        scales = ["hundred", "thousand", "million", "billion", "trillion"]

        numwords["and"] = (1, 0)
        for idx, word in enumerate(units):  numwords[word] = (1, idx)
        for idx, word in enumerate(tens):       numwords[word] = (1, idx * 10)
        for idx, word in enumerate(scales): numwords[word] = (10 ** (idx * 3 or 2), 0)

    ordinal_words = {'first':1, 'second':2, 'third':3, 'fifth':5, 'eighth':8, 'ninth':9, 'twelfth':12}
    ordinal_endings = [('ieth', 'y'), ('th', '')]

    textnum = textnum.replace('-', ' ')

    current = result = 0
    for word in textnum.split():
        if word in ordinal_words:
            scale, increment = (1, ordinal_words[word])
        else:
            for ending, replacement in ordinal_endings:
                if word.endswith(ending):
                    word = "%s%s" % (word[:-len(ending)], replacement)

            if word not in numwords:
                raise Exception("Illegal word: " + word)

            scale, increment = numwords[word]
        
         current = current * scale + increment
         if scale > 100:
            result += current
            current = 0

    return result + current`

score 6 · Accepted Answer

这是简单的案例方法：

>>> number = {'one':1,
...           'two':2,
...           'three':3,}
>>> 
>>> number['two']
2

还是您在寻找可以处理“一万两千，一百七十二”的东西？

score 3 · Accepted Answer

如果您要解析的数字数量有限，则可以轻松地将其硬编码到字典中。

对于稍微复杂的情况，您可能希望根据相对简单的数字语法自动生成此字典。与此类似的东西（当然，广义的......）

for i in range(10):
   myDict[30 + i] = "thirty-" + singleDigitsDict[i]

如果您需要更广泛的东西，那么看起来您将需要自然语言处理工具。这篇文章可能是一个很好的起点。

score 3 · Accepted Answer

使用 Python 包：WordToDigits

pip install wordtodigits

它可以在句子中找到以单词形式出现的数字，然后将它们转换为正确的数字格式。如果存在小数部分，还需要处理。数字的单词表示可以在文章中的任何地方。

score 3 · Accepted Answer

def parse_int(string):
    ONES = {'zero': 0,
            'one': 1,
            'two': 2,
            'three': 3,
            'four': 4,
            'five': 5,
            'six': 6,
            'seven': 7,
            'eight': 8,
            'nine': 9,
            'ten': 10,
            'eleven': 11,
            'twelve': 12,
            'thirteen': 13,
            'fourteen': 14,
            'fifteen': 15,
            'sixteen': 16,
            'seventeen': 17,
            'eighteen': 18,
            'nineteen': 19,
            'twenty': 20,
            'thirty': 30,
            'forty': 40,
            'fifty': 50,
            'sixty': 60,
            'seventy': 70,
            'eighty': 80,
            'ninety': 90,
              }

    numbers = []
    for token in string.replace('-', ' ').split(' '):
        if token in ONES:
            numbers.append(ONES[token])
        elif token == 'hundred':
            numbers[-1] *= 100
        elif token == 'thousand':
            numbers = [x * 1000 for x in numbers]
        elif token == 'million':
            numbers = [x * 1000000 for x in numbers]
    return sum(numbers)

用 700 个 1 到 100 万范围内的随机数进行测试效果很好。

score 1 · Accepted Answer

进行了更改，以便 text2int(scale) 将返回正确的转换。例如，text2int("hundred") => 100。

import re

numwords = {}


def text2int(textnum):

    if not numwords:

        units = [ "zero", "one", "two", "three", "four", "five", "six",
                "seven", "eight", "nine", "ten", "eleven", "twelve",
                "thirteen", "fourteen", "fifteen", "sixteen", "seventeen",
                "eighteen", "nineteen"]

        tens = ["", "", "twenty", "thirty", "forty", "fifty", "sixty", 
                "seventy", "eighty", "ninety"]

        scales = ["hundred", "thousand", "million", "billion", "trillion", 
                'quadrillion', 'quintillion', 'sexillion', 'septillion', 
                'octillion', 'nonillion', 'decillion' ]

        numwords["and"] = (1, 0)
        for idx, word in enumerate(units): numwords[word] = (1, idx)
        for idx, word in enumerate(tens): numwords[word] = (1, idx * 10)
        for idx, word in enumerate(scales): numwords[word] = (10 ** (idx * 3 or 2), 0)

    ordinal_words = {'first':1, 'second':2, 'third':3, 'fifth':5, 
            'eighth':8, 'ninth':9, 'twelfth':12}
    ordinal_endings = [('ieth', 'y'), ('th', '')]
    current = result = 0
    tokens = re.split(r"[\s-]+", textnum)
    for word in tokens:
        if word in ordinal_words:
            scale, increment = (1, ordinal_words[word])
        else:
            for ending, replacement in ordinal_endings:
                if word.endswith(ending):
                    word = "%s%s" % (word[:-len(ending)], replacement)

            if word not in numwords:
                raise Exception("Illegal word: " + word)

            scale, increment = numwords[word]

        if scale > 1:
            current = max(1, current)

        current = current * scale + increment
        if scale > 100:
            result += current
            current = 0

    return result + current

score 1 · Accepted Answer

Marc Burns的红宝石可以做到这一点。我最近分叉了它以增加多年的支持。您可以从 python 调用ruby 代码。

  require 'numbers_in_words'
  require 'numbers_in_words/duck_punch'

  nums = ["fifteen sixteen", "eighty five sixteen",  "nineteen ninety six",
          "one hundred and seventy nine", "thirteen hundred", "nine thousand two hundred and ninety seven"]
  nums.each {|n| p n; p n.in_numbers}

结果：
"fifteen sixteen" 1516 "eighty five sixteen" 8516 "nineteen ninety six" 1996 "one hundred and seventy nine" 179 "thirteen hundred" 1300 "nine thousand two hundred and ninety seven" 9297

score 0 · Accepted Answer

一个快速的解决方案是使用inflect.py生成字典进行翻译。

inflect.py 有一个number_to_words()函数，它将一个数字（例如2）转换为它的单词形式（例如'two'）。不幸的是，没有提供它的反向（这将允许您避免使用翻译词典路线）。同样，您可以使用该功能来构建翻译词典：

>>> import inflect
>>> p = inflect.engine()
>>> word_to_number_mapping = {}
>>>
>>> for i in range(1, 100):
...     word_form = p.number_to_words(i)  # 1 -> 'one'
...     word_to_number_mapping[word_form] = i
...
>>> print word_to_number_mapping['one']
1
>>> print word_to_number_mapping['eleven']
11
>>> print word_to_number_mapping['forty-three']
43

如果您愿意花一些时间，可能会检查 inflect.py 函数的内部工作原理number_to_words()并构建您自己的代码以动态执行此操作（我没有尝试过这样做）。

score 0 · Accepted Answer

我采用了@recursive 的逻辑并转换为 Ruby。我还对查找表进行了硬编码，因此它并不那么酷，但可能会帮助新手了解正在发生的事情。

WORDNUMS = {"zero"=> [1,0], "one"=> [1,1], "two"=> [1,2], "three"=> [1,3],
            "four"=> [1,4], "five"=> [1,5], "six"=> [1,6], "seven"=> [1,7], 
            "eight"=> [1,8], "nine"=> [1,9], "ten"=> [1,10], 
            "eleven"=> [1,11], "twelve"=> [1,12], "thirteen"=> [1,13], 
            "fourteen"=> [1,14], "fifteen"=> [1,15], "sixteen"=> [1,16], 
            "seventeen"=> [1,17], "eighteen"=> [1,18], "nineteen"=> [1,19], 
            "twenty"=> [1,20], "thirty" => [1,30], "forty" => [1,40], 
            "fifty" => [1,50], "sixty" => [1,60], "seventy" => [1,70], 
            "eighty" => [1,80], "ninety" => [1,90],
            "hundred" => [100,0], "thousand" => [1000,0], 
            "million" => [1000000, 0]}

def text_2_int(string)
  numberWords = string.gsub('-', ' ').split(/ /) - %w{and}
  current = result = 0
  numberWords.each do |word|
    scale, increment = WORDNUMS[word]
    current = current * scale + increment
    if scale > 100
      result += current
      current = 0
    end
  end
  return result + current
end

我想处理像这样的字符串two thousand one hundred and forty-six

score 0 · Accepted Answer

这处理印度风格的单词中的数字，一些分数，数字和单词的组合以及加法。

def words_to_number(words):
    numbers = {"zero":0, "a":1, "half":0.5, "quarter":0.25, "one":1,"two":2,
               "three":3, "four":4,"five":5,"six":6,"seven":7,"eight":8,
               "nine":9, "ten":10,"eleven":11,"twelve":12, "thirteen":13,
               "fourteen":14, "fifteen":15,"sixteen":16,"seventeen":17,
               "eighteen":18,"nineteen":19, "twenty":20,"thirty":30, "forty":40,
               "fifty":50,"sixty":60,"seventy":70, "eighty":80,"ninety":90}

    groups = {"hundred":100, "thousand":1_000, 
              "lac":1_00_000, "lakh":1_00_000, 
              "million":1_000_000, "crore":10**7, 
              "billion":10**9, "trillion":10**12}
    
    split_at = ["and", "plus"]
    
    n = 0
    skip = False
    words_array = words.split(" ")
    for i, word in enumerate(words_array):
        if not skip:
            if word in groups:
                n*= groups[word]
            elif word in numbers:
                n += numbers[word]
            elif word in split_at:
                skip = True
                remaining = ' '.join(words_array[i+1:])
                n+=words_to_number(remaining)
            else:
                try:
                    n += float(word)
                except ValueError as e:
                    raise ValueError(f"Invalid word {word}") from e
    return n

测试：

print(words_to_number("a million and one"))
>> 1000001

print(words_to_number("one crore and one"))
>> 1000,0001

print(words_to_number("0.5 million one"))
>> 500001.0

print(words_to_number("half million and one hundred"))
>> 500100.0

print(words_to_number("quarter"))
>> 0.25

print(words_to_number("one hundred plus one"))
>> 101

score -1 · Accepted Answer

此代码适用于系列数据：

import pandas as pd
mylist = pd.Series(['one','two','three'])
mylist1 = []
for x in range(len(mylist)):
    mylist1.append(w2n.word_to_num(mylist[x]))
print(mylist1)

score -3 · Accepted Answer

此代码仅适用于99以下的数字。word to int和int to word（其余需要实现10-20行代码和简单的逻辑。这只是初学者的简单代码）：

num = input("Enter the number you want to convert : ")
mydict = {'1': 'One', '2': 'Two', '3': 'Three', '4': 'Four', '5': 'Five','6': 'Six', '7': 'Seven', '8': 'Eight', '9': 'Nine', '10': 'Ten','11': 'Eleven', '12': 'Twelve', '13': 'Thirteen', '14': 'Fourteen', '15': 'Fifteen', '16': 'Sixteen', '17': 'Seventeen', '18': 'Eighteen', '19': 'Nineteen'}
mydict2 = ['', '', 'Twenty', 'Thirty', 'Fourty', 'fifty', 'sixty', 'Seventy', 'Eighty', 'Ninty']

if num.isdigit():
    if(int(num) < 20):
        print(" :---> " + mydict[num])
    else:
        var1 = int(num) % 10
        var2 = int(num) / 10
        print(" :---> " + mydict2[int(var2)] + mydict[str(var1)])
else:
    num = num.lower()
    dict_w = {'one': 1, 'two': 2, 'three': 3, 'four': 4, 'five': 5, 'six': 6, 'seven': 7, 'eight': 8, 'nine': 9, 'ten': 10, 'eleven': 11, 'twelve': 12, 'thirteen': 13, 'fourteen': 14, 'fifteen': 15, 'sixteen': 16, 'seventeen': '17', 'eighteen': '18', 'nineteen': '19'}
    mydict2 = ['', '', 'twenty', 'thirty', 'fourty', 'fifty', 'sixty', 'seventy', 'eighty', 'ninty']
    divide = num[num.find("ty")+2:]
    if num:
        if(num in dict_w.keys()):
            print(" :---> " + str(dict_w[num]))
        elif divide == '' :
            for i in range(0, len(mydict2)-1):
                if mydict2[i] == num:
                    print(" :---> " + str(i * 10))
        else :
            str3 = 0
            str1 = num[num.find("ty")+2:]
            str2 = num[:-len(str1)]
            for i in range(0, len(mydict2)):
                if mydict2[i] == str2:
                    str3 = i
            if str2 not in mydict2:
                print("----->Invalid Input<-----")                
            else:
                try:
                    print(" :---> " + str((str3*10) + dict_w[str1]))
                except:
                    print("----->Invalid Input<-----")
    else:
        print("----->Please Enter Input<-----")

python - 有没有办法将数字单词转换为整数？

16 回答 16

Related

Reference