Generate a novel using Markov chains
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 
 

128 lines
3.5 KiB

'''
markov.py - Generate a novel using Markov chains
Copyright (C) 2020 Blink The Things
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <https://www.gnu.org/licenses/>.
'''
import argparse
import numpy as np
import os
import spacy
# Seed used when no --seed option is given on the command line.
DEFAULT_SEED = 12345
# Number of sentences printed per run.
SENTENCE_COUNT = 10
# Sentinel states marking the beginning and the end of every sentence.
START = 'START'
STOP = 'STOP'
# Tokens with these coarse part-of-speech labels are ignored entirely: they
# neither become states nor contribute words.
SKIPPED_POS = ('SPACE', 'PUNCT', 'X')


def parse_args(argv=None):
    '''Parse the command line.

    Args:
        argv: Argument list to parse; ``None`` lets argparse read
            ``sys.argv[1:]``.

    Returns:
        Namespace with ``input`` (one or more file paths) and ``seed``
        (an int, or ``None`` when the option was not given).
    '''
    parser = argparse.ArgumentParser(description='Generate a novel using Markov chains.')
    parser.add_argument('input', nargs='+', help='used to construct Markov transition matrix')
    parser.add_argument('-s', '--seed', type=int, help='seed for random number generator')
    return parser.parse_args(argv)


def make_rng(seed=None):
    '''Return a NumPy random generator for *seed*.

    Only ``None`` selects ``DEFAULT_SEED``.  Testing ``seed`` for truthiness
    instead would silently turn an explicit seed of 0 into the default.
    '''
    return np.random.default_rng(DEFAULT_SEED if seed is None else seed)


def collect_states(doc, words, edges):
    '''Record the words and state transitions found in one parsed document.

    A token's state is ``"<tag_>,<dep_>,<n>"`` where ``n`` is its 1-based
    position in the sentence, counting only tokens that are not skipped.

    Args:
        doc: Parsed document exposing ``sents``; each sentence yields tokens
            with ``pos_``, ``tag_``, ``dep_`` and ``text`` attributes.
        words: Dict updated in place.  Each token text is appended under its
            positional state and under the position-free key
            ``"<tag_>,<dep_>"``.
        edges: List updated in place with ``(from_state, to_state)`` pairs.
            Every sentence contributes a path from START to STOP; a sentence
            with no usable tokens contributes ``(START, STOP)``.
    '''
    for sent in doc.sents:
        curr_state = START
        cnt = 0
        for token in sent:
            if token.pos_ in SKIPPED_POS:
                continue
            cnt += 1
            base = f'{token.tag_},{token.dep_}'
            next_state = f'{base},{cnt}'
            words.setdefault(base, []).append(token.text)
            words.setdefault(next_state, []).append(token.text)
            edges.append((curr_state, next_state))
            curr_state = next_state
        edges.append((curr_state, STOP))


def build_chain(edges):
    '''Turn observed transitions into per-state choice/probability tables.

    Args:
        edges: Iterable of ``(from_state, to_state)`` pairs.

    Returns:
        Dict mapping each source state to
        ``{'choices': [...], 'probs': [...]}``.  ``choices`` lists the
        successor states in first-seen order and ``probs`` holds their
        relative frequencies in the same order.
    '''
    transitions = {}
    for source, target in edges:
        targets = transitions.setdefault(source, {})
        targets[target] = targets.get(target, 0) + 1

    chain = {}
    for source, targets in transitions.items():
        total = sum(targets.values())
        chain[source] = {
            'choices': list(targets),
            'probs': [count / total for count in targets.values()],
        }
    return chain


def generate_sentences(chain, words, rng, count=SENTENCE_COUNT):
    '''Generate *count* sentences by random walks from START to STOP.

    Args:
        chain: Table produced by ``build_chain``.
        words: Dict mapping state keys to lists of candidate words.
        rng: NumPy random generator used for every draw.
        count: Number of sentences to produce.

    Returns:
        List of sentences, each a space-joined string of words.

    Raises:
        ValueError: If START is missing from *chain* or can only lead
            straight to STOP.  Without this check the first case fails with
            a bare KeyError and the second redraws forever.
    '''
    start = chain.get(START)
    if start is None or all(state == STOP for state in start['choices']):
        raise ValueError('input contains no sentences with usable words')

    sents = []
    for _ in range(count):
        choice = START
        choices = []
        while True:
            next_choice = rng.choice(chain[choice]['choices'], p=chain[choice]['probs'])
            if next_choice == STOP:
                if choice == START:
                    # Never emit an empty sentence: discard and draw again.
                    continue
                sents.append(' '.join(choices))
                break
            try:
                word = rng.choice(words[next_choice])
            except KeyError:
                # No words stored for this exact position: drop the trailing
                # position component and use the position-free pool.
                word = rng.choice(words[','.join(next_choice.split(',')[:-1])])
            choices.append(word)
            choice = next_choice
    return sents


def main(argv=None):
    '''Read the input files, build the Markov chain and print the sentences.'''
    args = parse_args(argv)
    nlp = spacy.load('en_core_web_sm')
    rng = make_rng(args.seed)

    words = {}
    edges = []
    # Each file is parsed on its own; words and edges accumulate across files.
    for infile in args.input:
        with open(infile, mode='r') as f:
            input_text = f.read()
        collect_states(nlp(input_text), words, edges)

    try:
        sents = generate_sentences(build_chain(edges), words, rng)
    except ValueError as err:
        raise SystemExit(f'markov.py: error: {err}') from None
    print(os.linesep.join(sents))


if __name__ == '__main__':
    main()