'''
markov.py - Generate a novel using Markov chains
Copyright (C) 2020 Blink The Things

This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.

You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <https://www.gnu.org/licenses/>.
'''
|
|
|
|
import argparse
|
|
import numpy as np
|
|
import os
|
|
import spacy
|
|
import sys
|
|
from textwrap import fill
|
|
|
|
# ---------------------------------------------------------------------------
# Command line, language model and random number generator set-up.
# ---------------------------------------------------------------------------
parser = argparse.ArgumentParser(description='Generate a novel using Markov chains.')
parser.add_argument('input', nargs='+', help='used to construct Markov transition matrix')
parser.add_argument('-w', '--words', type=int, help='generate at least WORDS words')
parser.add_argument('-s', '--seed', type=int, help='seed for random number generator')
args = parser.parse_args()

# spaCy pipeline used to split the input into sentences and tag each token.
nlp = spacy.load('en_core_web_sm')

# Compare against None rather than relying on truthiness: 0 is a legitimate
# value for both options and must not be silently replaced by the default.
rng = np.random.default_rng(12345 if args.seed is None else args.seed)
word_cnt = 100 if args.words is None else args.words
|
|
|
|
# ---------------------------------------------------------------------------
# Corpus ingestion.
#
# words: maps a state ("<tag>,<dep>" of a token) to every word seen in that
#        state, one entry per occurrence, so a uniform draw from the list is
#        weighted by how often the word occurred.
# edges: (from_state, to_state) pairs in reading order; each sentence is
#        bracketed by the synthetic 'START' and 'STOP' states.
# ---------------------------------------------------------------------------
words = {}
edges = []

# Longest text handed to spaCy in one call.
# NOTE(review): presumably chosen to match spaCy's default nlp.max_length of
# 1,000,000 characters -- confirm if the pipeline configuration changes.
MAX_INPUT_CHARS = 1000000

# Part-of-speech classes that never become a state (whitespace, punctuation
# and tokens spaCy labels 'X').
SKIPPED_POS = ('SPACE', 'PUNCT', 'X')

input_text = ''
for infile in args.input:
    with open(infile, mode='r') as f:
        input_text = f.read()

    # Truncate over-long input at the last space at or before the limit so a
    # word is not cut in half. If there is no space at all, fall back to a
    # hard cut instead of walking off the front of the string.
    limit = MAX_INPUT_CHARS
    if len(input_text) > limit:
        last_space = input_text.rfind(' ', 0, limit + 1)
        if last_space != -1:
            limit = last_space

    # Every input file is parsed on its own and contributes to the shared
    # words/edges tables, so all files given on the command line are used.
    doc = nlp(input_text[:limit])

    for sent in doc.sents:
        curr_state = 'START'
        for token in sent:
            if token.pos_ in SKIPPED_POS:
                continue

            next_state = f'{token.tag_},{token.dep_}'

            # Record the word exactly once per occurrence.
            words.setdefault(next_state, []).append(token.text)

            edges.append((curr_state, next_state))
            curr_state = next_state

        edges.append((curr_state, 'STOP'))
|
|
|
|
# ---------------------------------------------------------------------------
# Turn the observed edges into a probability table.
#
# transitions[state] = {'cnt': total outgoing edges,
#                       'to':  {next_state: number of times it followed}}
# chain[state]       = {'choices': [next_state, ...],
#                       'probs':   [relative frequency of each choice, ...]}
# ---------------------------------------------------------------------------
transitions = {}
for source, target in edges:
    node = transitions.setdefault(source, {'cnt': 0, 'to': {}})
    node['cnt'] += 1
    node['to'][target] = node['to'].get(target, 0) + 1

# 'choices' and 'probs' are parallel lists: both follow the insertion order of
# the 'to' dictionary, i.e. the order in which each target was first seen.
chain = {
    source: {
        'choices': list(node['to']),
        'probs': [seen / node['cnt'] for seen in node['to'].values()],
    }
    for source, node in transitions.items()
}
|
|
|
|
# ---------------------------------------------------------------------------
# Text generation: walk the chain from 'START' to 'STOP' to build sentences,
# group sentences into paragraphs and print the result.
# ---------------------------------------------------------------------------
def _format_paragraph(sentences):
    """Join finished sentences and wrap them into one paragraph of text."""
    return fill(''.join(sentences), replace_whitespace=False, drop_whitespace=False)


# Without at least one non-STOP transition out of 'START' no sentence can ever
# be produced: the lookup below would raise KeyError (empty corpus) or the
# resampling loop would never terminate (only empty sentences in the corpus).
if word_cnt > 0 and all(
    option == 'STOP' for option in chain.get('START', {}).get('choices', [])
):
    sys.exit('markov.py: the input contains no usable sentences')

sents = []
paragraphs = []

# A paragraph is closed once this counter drops below zero, i.e. after one
# sentence more than the number drawn here.
paragraph_sent_cnt = rng.integers(5, 10)

while word_cnt > 0:
    choice = 'START'
    choices = []

    while True:
        next_choice = rng.choice(chain[choice]['choices'], p=chain[choice]['probs'])

        # Never emit an empty sentence: draw again.
        if choice == 'START' and next_choice == 'STOP':
            continue

        if next_choice == 'STOP':
            # Re-attach tokens that start with an apostrophe or backtick to
            # the preceding word, then terminate the sentence.
            sents.append(' '.join(choices)
                         .replace(" '", "'")
                         .replace(" ’", "’")
                         .replace(" `", "`")
                         + '. ')

            paragraph_sent_cnt -= 1
            if paragraph_sent_cnt < 0:
                paragraphs.append(_format_paragraph(sents))
                sents = []
                paragraph_sent_cnt = rng.integers(5, 10)
            break

        try:
            word = rng.choice(words[next_choice])
        except KeyError:
            # Fall back to the state with its last component dropped.
            word = rng.choice(words[','.join(next_choice.split(',')[:-1])])

        # Capitalise the first word of a sentence and the pronoun "I";
        # lower-case everything else except proper nouns.
        if choice == 'START' or word == 'i':
            word = str.title(word)
        elif not (next_choice.startswith('PROPN') or word == 'I'):
            word = str.lower(word)

        choices.append(word)
        word_cnt -= 1
        choice = next_choice

# The word budget usually runs out in the middle of a paragraph; emit the
# sentences collected so far instead of silently dropping them, otherwise the
# output can contain fewer than the requested number of words (or nothing).
if sents:
    paragraphs.append(_format_paragraph(sents))

# print() writes in text mode, which already translates '\n' to the platform
# line ending; using os.linesep here would produce '\r\r\n' on Windows.
print('\n\n'.join(paragraphs))
|