mirror of
https://github.com/rembo10/headphones.git
synced 2026-01-10 07:17:59 -05:00
219 lines
6.6 KiB
Python
219 lines
6.6 KiB
Python
from .compat import _range
|
|
|
|
# Step-2 suffix rewrites.  The key is the penultimate character of the word
# (``Stemmer.step2and3`` looks the table up with ``b[k-1]``); each entry is a
# ``(suffix, replacement)`` pair.  Both halves are lists of characters because
# ``Stemmer.ends`` compares the suffix against a slice of its list buffer.
_s2_options = {
    'a': (
        (list('ational'), list('ate')),
        (list('tional'), list('tion')),
    ),
    'c': (
        (list('enci'), list('ence')),
        (list('anci'), list('ance')),
    ),
    'e': (
        (list('izer'), list('ize')),
    ),
    'l': (
        (list('bli'), list('ble')),
        (list('alli'), list('al')),
        (list('entli'), list('ent')),
        (list('eli'), list('e')),
        (list('ousli'), list('ous')),
    ),
    'o': (
        (list('ization'), list('ize')),
        (list('ation'), list('ate')),
        (list('ator'), list('ate')),
    ),
    's': (
        (list('alism'), list('al')),
        (list('iveness'), list('ive')),
        (list('fulness'), list('ful')),
        (list('ousness'), list('ous')),
    ),
    't': (
        (list('aliti'), list('al')),
        (list('iviti'), list('ive')),
        (list('biliti'), list('ble')),
    ),
    'g': (
        (list('logi'), list('log')),
    ),
}
|
|
|
|
|
|
# Step-3 suffix rewrites.  The key is the last character of the word
# (``Stemmer.step2and3`` looks the table up with ``b[k]``); each entry is a
# ``(suffix, replacement)`` pair of character lists.  An empty replacement
# means the suffix is simply dropped.
_s3_options = {
    'e': (
        (list('icate'), list('ic')),
        (list('ative'), []),
        (list('alize'), list('al')),
    ),
    'i': (
        (list('iciti'), list('ic')),
    ),
    'l': (
        (list('ical'), list('ic')),
        (list('ful'), []),
    ),
    's': (
        (list('ness'), []),
    ),
}
|
|
|
|
# Step-4 removable suffixes.  The key is the penultimate character of the
# word (``Stemmer.step4`` looks the table up with ``b[k-1]``); each value is a
# tuple of suffixes, tried in order, every suffix being a list of characters.
_s4_endings = {
    'a': (list('al'),),
    'c': (list('ance'), list('ence')),
    'e': (list('er'),),
    'i': (list('ic'),),
    'l': (list('able'), list('ible')),
    'n': (list('ant'), list('ement'), list('ment'), list('ent')),
    # handle 'o' separately
    's': (list('ism'),),
    't': (list('ate'), list('iti')),
    'u': (list('ous'),),
    'v': (list('ive'),),
    'z': (list('ize'),),
}
|
|
|
|
|
|
class Stemmer(object):
    """Suffix-stripping stemmer following the steps of the Porter algorithm.

    The word is kept as a list of characters in ``self.b``.  ``self.k`` is the
    index of the last character that still belongs to the word (characters
    beyond it are ignored by ``result``), and ``self.j`` is the index of the
    last stem character as recorded by the most recent successful ``ends``
    call.
    """

    def __init__(self, b):
        """Prepare to stem ``b``, an iterable of characters (e.g. a str)."""
        self.b = list(b)
        # Measure the stored list, not the argument, so iterables without
        # ``len()`` (generators, iterators) are accepted too.
        self.k = len(self.b) - 1
        self.j = 0

    def cons(self, i):
        """ True iff b[i] is a consonant """
        ch = self.b[i]
        if ch in 'aeiou':
            return False
        if ch == 'y':
            # 'y' counts as a consonant at the start of the word or when the
            # preceding character is not itself a consonant.
            return i == 0 or not self.cons(i - 1)
        return True

    def m(self):
        """Return the number of vowel-run/consonant-run pairs in b[0..j].

        An optional leading consonant run is skipped; after that the counter
        goes up each time a run of vowels is followed by a consonant.
        """
        n = i = 0
        # Skip the leading consonants, if any.
        while True:
            if i > self.j:
                return n
            if not self.cons(i):
                break
            i += 1
        i += 1
        while True:
            # Walk to the end of the current vowel run.
            while True:
                if i > self.j:
                    return n
                if self.cons(i):
                    break
                i += 1

            i += 1
            n += 1

            # Walk to the end of the consonant run that follows it.
            while True:
                if i > self.j:
                    return n
                if not self.cons(i):
                    break
                i += 1
            i += 1

    def vowel_in_stem(self):
        """ True iff 0...j contains vowel """
        return any(not self.cons(i) for i in range(self.j + 1))

    def doublec(self, j):
        """ True iff j, j-1 contains double consonant """
        if j < 1 or self.b[j] != self.b[j - 1]:
            return False
        return self.cons(j)

    def cvc(self, i):
        """ True iff i-2,i-1,i is consonant-vowel-consonant
        and if second c isn't w, x, or y.
        used to restore e at end of short words like cave, love, hope, crime
        """
        if i < 2 or self.b[i] in 'wxy':
            return False
        return self.cons(i) and not self.cons(i - 1) and self.cons(i - 2)

    def ends(self, s):
        """ True iff 0...k ends with s; on success j is set to the stem end.

        ``s`` must be a list of characters: it is compared with a slice of the
        list buffer, so a str or tuple would never compare equal.
        """
        length = len(s)
        if length > self.k + 1:
            # Suffix longer than the word; do not rely on negative-slice
            # wraparound to produce the mismatch.
            return False
        res = (self.b[self.k - length + 1:self.k + 1] == s)
        if res:
            self.j = self.k - length
        return res

    def setto(self, s):
        """ set j+1...k to string s, readjusting k """
        length = len(s)
        self.b[self.j + 1:self.j + 1 + length] = s
        self.k = self.j + length

    def r(self, s):
        """Replace the matched suffix with ``s`` if the stem has m() > 0."""
        if self.m() > 0:
            self.setto(s)

    def step1ab(self):
        """Strip plural endings and -ed / -ing.

        For example caresses -> caress, ponies -> poni, agreed -> agree,
        hopping -> hop, filing -> file.
        """
        if self.b[self.k] == 's':
            if self.ends(['s', 's', 'e', 's']):
                self.k -= 2
            elif self.ends(['i', 'e', 's']):
                self.setto(['i'])
            elif self.b[self.k - 1] != 's':
                self.k -= 1
        if self.ends(['e', 'e', 'd']):
            if self.m() > 0:
                self.k -= 1
        elif ((self.ends(['e', 'd']) or self.ends(['i', 'n', 'g'])) and
              self.vowel_in_stem()):
            self.k = self.j
            if self.ends(['a', 't']):
                self.setto(['a', 't', 'e'])
            elif self.ends(['b', 'l']):
                self.setto(['b', 'l', 'e'])
            elif self.ends(['i', 'z']):
                self.setto(['i', 'z', 'e'])
            elif self.doublec(self.k):
                # Drop one letter of the doubled consonant, except for
                # l, s and z where the double is kept.
                self.k -= 1
                if self.b[self.k] in 'lsz':
                    self.k += 1
            elif self.m() == 1 and self.cvc(self.k):
                self.setto(['e'])

    def step1c(self):
        """ turn terminal y into i if there's a vowel in stem """
        if self.ends(['y']) and self.vowel_in_stem():
            self.b[self.k] = 'i'

    def step2and3(self):
        """Apply the first matching step-2 rewrite, then the first step-3 one.

        The step-2 table is selected by the penultimate character and the
        step-3 table by the last character.  A matching suffix ends the
        search even when ``r`` leaves the word unchanged.
        """
        for end, repl in _s2_options.get(self.b[self.k - 1], ()):
            if self.ends(end):
                self.r(repl)
                break

        for end, repl in _s3_options.get(self.b[self.k], ()):
            if self.ends(end):
                self.r(repl)
                break

    def step4(self):
        """Remove a step-4 suffix when the remaining stem has m() > 1."""
        ch = self.b[self.k - 1]

        if ch == 'o':
            # -ion is only removable after 's' or 't'; -ou is always a
            # candidate.
            if not ((self.ends(['i', 'o', 'n']) and self.b[self.j] in 'st') or
                    self.ends(['o', 'u'])):
                return
        else:
            for end in _s4_endings.get(ch, ()):
                if self.ends(end):
                    break
            else:
                # No suffix from the table matched.
                return

        if self.m() > 1:
            self.k = self.j

    def step5(self):
        """Drop a final 'e' and reduce a final 'll' to 'l' on long stems."""
        self.j = self.k
        if self.b[self.k] == 'e':
            a = self.m()
            if a > 1 or a == 1 and not self.cvc(self.k - 1):
                self.k -= 1
        if self.b[self.k] == 'l' and self.doublec(self.k) and self.m() > 1:
            self.k -= 1

    def result(self):
        """Return the current word, b[0..k], as a string."""
        return ''.join(self.b[:self.k + 1])

    def stem(self):
        """Run all steps and return the stem.

        Words of one or two characters are returned unchanged.
        """
        if self.k > 1:
            self.step1ab()
            self.step1c()
            self.step2and3()
            self.step4()
            self.step5()
        return self.result()
|