Source code for simudo.util.string_system
import functools
from collections import defaultdict
from itertools import chain as ichain
from itertools import combinations
from suffix_trees.STree import STree
# Names exported by ``from <module> import *``.
# NOTE(review): ``format_string_system`` is not defined in the part of the
# file visible here -- confirm it exists elsewhere in the module.
__all__ = [
    "make_string_system",
    "format_string_system",
]
def iterjoin(separator, iterable):
    """Yield the items of *iterable* with *separator* between each pair.

    This is a lazy analogue of ``str.join`` that works for items of any
    type: the separator is emitted between consecutive items, never before
    the first one or after the last one.

    Args:
        separator: Object yielded between consecutive items.
        iterable: Any iterable; it is consumed lazily.

    Yields:
        The items of *iterable* in order, interleaved with *separator*.
        Nothing is yielded when *iterable* is empty.
    """
    iterator = iter(iterable)
    try:
        first = next(iterator)
    except StopIteration:
        # A StopIteration escaping a generator body is turned into a
        # RuntimeError (PEP 479), so finish the generator explicitly.
        return
    yield first
    for item in iterator:
        yield separator
        yield item
def all_lcs(stree, minimum_length=0):
    """Collect longest common substrings over subsets of the tree's strings.

    Starting from every pair of string indices, the subset is grown one
    index at a time for as long as its longest common substring is at least
    *minimum_length* long and has not been recorded yet.

    Args:
        stree: Object exposing ``word_starts`` (its length is used as the
            number of strings) and ``lcs(list_of_indices)``.
        minimum_length: Shortest substring length that is recorded.

    Returns:
        The set of distinct substrings found.
    """
    num_strings = len(stree.word_starts)
    seen = set()

    def expand(indices):
        # The index set is passed as a real ``list`` on purpose: the
        # upstream ``lcs`` has an extraneous type check,
        #   if stringIdxs == -1 or not isinstance(stringIdxs, list):
        common = stree.lcs(list(indices))
        if len(common) < minimum_length or common in seen:
            # Too short or already recorded: do not grow this subset.
            return
        seen.add(common)
        for extra in range(num_strings):
            if extra in indices:
                continue
            expand(indices | {extra})

    for pair in combinations(range(num_strings), 2):
        expand(frozenset(pair))
    return seen
def default_score(num_occurrences, len_lcs):
    """Return the default ranking of a candidate substring.

    Args:
        num_occurrences: How many times the substring occurs.
        len_lcs: Length of the substring.

    Returns:
        The product of the two arguments.
    """
    return num_occurrences * len_lcs
def make_string_system(
        strings,
        min_length=6,
        score=default_score):
    """Factor substrings shared between *strings* into numbered definitions.

    Each round gathers the substrings common to at least two of the current
    string fragments (via ``all_lcs``), picks the one with the highest
    *score*, replaces every occurrence of it with an integer definition
    index, and appends the substring itself as a new definition. Rounds
    repeat until no candidate of at least *min_length* characters remains.

    Args:
        strings: Sequence of ``str`` to factor.
        min_length: Minimum length of a shared substring worth extracting.
            Empty substrings are never extracted, whatever this value is.
        score: Callable ``score(num_occurrences, len_lcs)`` used to rank
            candidates; the highest-scoring candidate is extracted.

    Returns:
        A pair ``(encoded, definitions)``. ``encoded[i]`` is the token
        sequence for ``strings[i]``: ``str`` fragments (possibly empty,
        as produced by ``str.split``) interleaved with ``int`` definition
        indices. ``definitions[k]`` is the token sequence of definition
        ``k``; it can itself contain indices of definitions extracted in
        later rounds.
    """
    num_strings = len(strings)
    strings = [(x,) for x in strings]
    def_index = 0
    while True:
        # Only the string fragments take part in the search; integer
        # tokens are references to already extracted definitions.
        sstrings = [x for s in strings for x in s if isinstance(x, str)]
        if len(sstrings) < 2:
            # all_lcs only examines index subsets of size >= 2, so nothing
            # can be shared here; skip building a suffix tree altogether.
            break
        stree = STree(sstrings)
        lcss = all_lcs(stree, minimum_length=min_length)
        # An empty candidate (possible when min_length <= 0) cannot be
        # extracted: str.split("") raises ValueError.
        lcss.discard("")
        if not lcss:
            break

        def candidate_score(sub):
            occurrences = sum(s.count(sub) for s in sstrings)
            return score(occurrences, len(sub))

        best = max(lcss, key=candidate_score)
        # Rewrite every token sequence (inputs and earlier definitions):
        # string fragments are split around `best` and re-joined with the
        # new definition index; integer tokens pass through untouched.
        strings = [
            list(ichain.from_iterable(
                (x,) if not isinstance(x, str)
                else iterjoin(def_index, x.split(best))
                for x in s))
            for s in strings]
        strings.append((best,))
        def_index += 1
    return (strings[:num_strings], strings[num_strings:])
# Example usage:
# print(format_string_system(make_string_system([
#     "function arguments of different types will be cached separately",
#     "bound function is periodically called with the same arguments",
#     'Apply function of two arguments cumulatively to the items of sequence',
#     'argumentative individual'], 5)))