'''
markov.py - Generate a novel using Markov chains

Copyright (C) 2020 Blink The Things

This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU Affero General Public License for more details.

You should have received a copy of the GNU Affero General Public License
along with this program.  If not, see <https://www.gnu.org/licenses/>.
'''

import argparse
import os
from collections import Counter, defaultdict

import numpy as np

# Pseudo-states that bracket every sentence in the part-of-speech chain.
START = 'START'
STOP = 'STOP'

# Coarse spaCy part-of-speech labels that never contribute a word or a state.
SKIPPED_POS = frozenset(('SPACE', 'PUNCT', 'X'))

DEFAULT_SEED = 12345
SENTENCE_COUNT = 10


def build_parser():
    '''Return the command-line parser for the script.'''
    parser = argparse.ArgumentParser(
        description='Generate a novel using Markov chains.')
    parser.add_argument('word_file', help='file used for word selection')
    parser.add_argument(
        'pos_file', help='file used to build part-of-speech Markov chain')
    # The default lives here (instead of ``args.seed or 12345``) so that an
    # explicit ``--seed 0`` is honoured rather than treated as "not given".
    parser.add_argument(
        '-s', '--seed', type=int, default=DEFAULT_SEED,
        help='seed for random number generator')
    return parser


def read_text(path):
    '''Return the whole content of the text file at *path*, decoded as UTF-8.'''
    with open(path, mode='r', encoding='utf-8') as f:
        return f.read()


def collect_words(doc):
    '''Group the token texts of a parsed document by fine-grained tag.

    *doc* is a parsed spaCy document.  Tokens whose coarse part of speech is
    in SKIPPED_POS are ignored.  Returns a dict mapping each tag to the list
    of token texts carrying it, in document order (duplicates kept, so
    frequent words are proportionally more likely to be drawn later).
    '''
    words = defaultdict(list)
    for sent in doc.sents:
        for token in sent:
            if token.pos_ in SKIPPED_POS:
                continue
            words[token.tag_].append(token.text)
    return dict(words)


def collect_edges(doc):
    '''Return the (state, next_state) transitions observed in *doc*.

    Every sentence contributes a path START -> tag -> ... -> STOP over the
    tags of its tokens, skipping tokens whose coarse part of speech is in
    SKIPPED_POS.  A sentence with no remaining tokens yields (START, STOP).
    '''
    edges = []
    for sent in doc.sents:
        state = START
        for token in sent:
            if token.pos_ in SKIPPED_POS:
                continue
            edges.append((state, token.tag_))
            state = token.tag_
        edges.append((state, STOP))
    return edges


def build_chain(edges):
    '''Convert observed transitions into per-state probabilities.

    *edges* is an iterable of (state, next_state) pairs.  Returns a dict
    mapping each state to ``{'choices': [...], 'probs': [...]}`` where the
    two lists are parallel, ordered by first occurrence of each successor,
    and ``probs`` holds the relative frequency of each successor.
    '''
    counts = defaultdict(Counter)
    for state, next_state in edges:
        counts[state][next_state] += 1

    chain = {}
    for state, successors in counts.items():
        total = sum(successors.values())
        chain[state] = {
            'choices': list(successors),
            'probs': [n / total for n in successors.values()],
        }
    return chain


def _is_usable(state, candidate, words):
    '''Tell whether moving from *state* to *candidate* can be emitted.

    STOP is rejected straight after START (that would be an empty sentence);
    any other candidate needs at least one word in the vocabulary.
    '''
    if candidate == STOP:
        return state != START
    return candidate in words


def generate_sentence(chain, words, rng):
    '''Random-walk the chain from START to STOP and return one sentence.

    *chain* is the mapping produced by build_chain, *words* maps tags to
    candidate words, and *rng* is a numpy random Generator.  Transitions that
    cannot be emitted (an empty sentence, or a tag absent from *words*) are
    redrawn.

    Raises ValueError when the chain has no START state or when some reached
    state has no usable successor at all, instead of looping forever or
    failing with a bare KeyError.
    '''
    if START not in chain:
        raise ValueError(
            'the part-of-speech chain is empty: no sentences were found')

    state = START
    picked = []
    while True:
        node = chain[state]
        if not any(_is_usable(state, c, words) for c in node['choices']):
            raise ValueError(
                'no usable transition out of state {!r}: its successors are '
                'either an empty sentence or tags with no words in the word '
                'file'.format(state))

        # Rejection sampling: keep drawing until the transition is usable.
        while True:
            following = str(rng.choice(node['choices'], p=node['probs']))
            if _is_usable(state, following, words):
                break

        if following == STOP:
            return ' '.join(picked)
        picked.append(str(rng.choice(words[following])))
        state = following


def generate_sentences(chain, words, rng, count=SENTENCE_COUNT):
    '''Return a list of *count* sentences drawn with generate_sentence.'''
    return [generate_sentence(chain, words, rng) for _ in range(count)]


def main(argv=None):
    '''Parse the command line, build the chain and print the sentences.'''
    args = build_parser().parse_args(argv)

    # Imported here rather than at module level so that ``--help`` and
    # argument errors do not depend on spaCy, and the helpers above can be
    # imported without it.
    import spacy

    nlp = spacy.load('en_core_web_sm')
    rng = np.random.default_rng(args.seed)

    words = collect_words(nlp(read_text(args.word_file)))
    chain = build_chain(collect_edges(nlp(read_text(args.pos_file))))

    try:
        sentences = generate_sentences(chain, words, rng)
    except ValueError as err:
        raise SystemExit('markov.py: {}'.format(err)) from err

    # print() writes through text-mode stdout, which already translates '\n'
    # to the platform line ending; joining with os.linesep would double the
    # carriage return on Windows.
    print('\n'.join(sentences))


if __name__ == '__main__':
    main()