2020-02-07 17:24:10 +03:00
|
|
|
import random
|
|
|
|
from collections import deque
|
|
|
|
import re
|
|
|
|
|
|
|
|
class Dictogram(dict):
    """Histogram of hashable items stored as ``{item: occurrence count}``.

    Two counters are maintained by :meth:`update`:

    * ``types``  -- number of distinct items counted so far;
    * ``tokens`` -- total number of items counted so far.
    """

    def __init__(self, iterable=None):
        """Create an empty histogram and, if given, count ``iterable`` into it."""
        super(Dictogram, self).__init__()
        self.types = 0   # distinct items
        self.tokens = 0  # all items, duplicates included
        if iterable:
            self.update(iterable)

    def update(self, iterable):
        """Count every item of ``iterable`` into the histogram.

        NOTE: this deliberately shadows ``dict.update``; the argument is a
        plain iterable of items, not a mapping or key/value pairs.
        """
        for item in iterable:
            if item in self:
                self[item] += 1
            else:
                self[item] = 1
                self.types += 1
            self.tokens += 1

    def count(self, item):
        """Return how many times ``item`` was counted, or 0 if never seen."""
        return self.get(item, 0)

    def return_random_word(self):
        """Return one key chosen uniformly at random, ignoring the counts.

        Raises:
            IndexError: if the histogram is empty.
        """
        # random.sample() rejects dicts (they are not sequences), so pick
        # from an explicit list of the keys instead.
        return random.choice(list(self))

    def return_weighted_random_word(self):
        """Return one key chosen with probability proportional to its count.

        Raises:
            ValueError: if the histogram holds no counted items.
        """
        # Sum the stored counts rather than trusting ``self.tokens`` so the
        # draw stays correct even if an entry was assigned to directly.
        total = sum(self.values())
        if total < 1:
            raise ValueError('cannot pick a word from an empty Dictogram')
        threshold = random.randint(0, total - 1)
        running_total = 0
        for word, occurrences in self.items():
            running_total += occurrences
            # The first key whose cumulative count passes the threshold wins.
            if running_total > threshold:
                return word
|
|
|
|
|
2020-02-07 14:51:19 +00:00
|
|
|
def get(text):
    """Generate a pseudo-random passage from ``text`` using a word-level Markov chain.

    The text is stripped of a few punctuation characters, prefixed with a
    ``START`` marker, and every ``.`` is turned into an ``END`` marker token.
    A model mapping each word to a ``Dictogram`` of the words that follow it
    is built, a chain of up to 31 words is walked through it, and the
    ``END`` markers are turned back into periods.

    Args:
        text: source text to imitate.

    Returns:
        The generated string, or ``''`` when ``text`` contains no words to
        build a model from.
    """

    def generate_random_start(model):
        """Pick the first word of the chain.

        Prefers a word that followed an ``END`` marker (i.e. a sentence
        opener), drawn with the same weights as in the source text.
        """
        followers = model.get('END')
        # Only rejection-sample when a non-END follower exists; otherwise
        # the loop below could never terminate.
        if followers is not None and any(word != 'END' for word in followers):
            seed_word = 'END'
            while seed_word == 'END':
                seed_word = followers.return_weighted_random_word()
            return seed_word
        # No usable sentence opener: start anywhere, but avoid the marker
        # tokens unless they are the only keys available.
        candidates = [word for word in model if word not in ('START', 'END')]
        return random.choice(candidates or list(model))

    def generate_random_sentence(length, markov_model):
        """Walk the model for at most ``length`` steps and join the words."""
        current_word = generate_random_start(markov_model)
        sentence = [current_word]
        for _ in range(length):
            try:
                current_dictogram = markov_model[current_word]
            except KeyError:
                # Dead end: this word was never followed by anything, so
                # further iterations could not make progress either.
                break
            current_word = current_dictogram.return_weighted_random_word()
            sentence.append(current_word)
        sentence[0] = sentence[0].capitalize()
        if sentence[-1] == 'END':
            # The trailing END marker becomes the final period later on;
            # appending another one would produce "..".
            return ' '.join(sentence)
        return ' '.join(sentence) + '.'

    def make_markov_model(data):
        """Map every word in ``data`` to a Dictogram of its successors."""
        markov_model = dict()
        for word, follower in zip(data, data[1:]):
            if word in markov_model:
                markov_model[word].update([follower])
            else:
                markov_model[word] = Dictogram([follower])
        return markov_model

    # simple cleanup
    for symbol in ('—', '«', '»', '(', ')'):
        text = text.replace(symbol, '')
    text = "START " + text
    text = text.replace('.', ' END')

    text_list = text.split()
    model = make_markov_model(text_list)
    if not model:
        # Only the START marker survived the cleanup: nothing to generate.
        return ''

    generated = generate_random_sentence(30, model)
    generated = generated.replace(' END', '.')
    return generated
|