'''
markov.py - Generate a novel using Markov chains
Copyright (C) 2020  Blink The Things

This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU Affero General Public License for more details.

You should have received a copy of the GNU Affero General Public License
along with this program.  If not, see <https://www.gnu.org/licenses/>.
'''

import argparse
import os
import sys
from collections import Counter, defaultdict
from textwrap import fill

import numpy as np

# Pseudo-states that bracket every sentence in the chain.
START = 'START'
STOP = 'STOP'

# Coarse part-of-speech labels (token.pos_) that never become chain states.
SKIPPED_POS = frozenset(('SPACE', 'PUNCT', 'X'))

# spaCy refuses texts longer than `nlp.max_length`, which defaults to
# 1,000,000 characters, so each input file is cut to at most this many.
MAX_CHARS = 1_000_000


def parse_args(argv=None):
    '''Parse the command line (``sys.argv[1:]`` when *argv* is None).'''
    parser = argparse.ArgumentParser(description='Generate a novel using Markov chains.')
    parser.add_argument('input', nargs='+', help='used to construct Markov transition matrix')
    # Real defaults (rather than `value or default`) so an explicit 0 is honoured.
    parser.add_argument('-w', '--words', type=int, default=100, help='generate at least WORDS words')
    parser.add_argument('-s', '--seed', type=int, default=12345, help='seed for random number generator')
    return parser.parse_args(argv)


def truncate_at_space(text, limit=MAX_CHARS):
    '''Return *text* unchanged if it has at most *limit* characters.

    Longer text is cut just before the last space found at index *limit* or
    earlier, so no word is split. If there is no usable space the text is
    hard-cut to *limit* characters instead of scanning past the start.
    '''
    if len(text) <= limit:
        return text
    cut = text.rfind(' ', 0, limit + 1)
    return text[:cut] if cut > 0 else text[:limit]


def collect(doc, words, transitions):
    '''Fold one parsed document into the vocabulary and transition counts.

    *doc* must expose ``.sents``; every token needs ``pos_``, ``tag_``,
    ``dep_`` and ``text``. A chain state is the string ``'<tag_>,<dep_>'``.

    *words* is a ``defaultdict(list)`` mapping state -> every surface form
    seen in that state (repeats kept, so sampling follows word frequency).
    *transitions* is a ``defaultdict(Counter)`` mapping state -> Counter of
    successor states. Both are updated in place.
    '''
    for sent in doc.sents:
        curr_state = START
        for token in sent:
            if token.pos_ in SKIPPED_POS:
                continue
            next_state = f'{token.tag_},{token.dep_}'
            words[next_state].append(token.text)
            transitions[curr_state][next_state] += 1
            curr_state = next_state
        # A sentence with no usable tokens would only add a START -> STOP
        # edge, which generation would have to reject; skip it here instead.
        if curr_state != START:
            transitions[curr_state][STOP] += 1


def build_chain(transitions):
    '''Convert successor counts into sampling tables.

    Returns ``{state: {'choices': [successor, ...], 'probs': [p, ...]}}``
    where the probabilities are the counts divided by their total and the
    successors keep their first-seen order.
    '''
    chain = {}
    for state, successors in transitions.items():
        total = sum(successors.values())
        chain[state] = {
            'choices': list(successors),
            'probs': [count / total for count in successors.values()],
        }
    return chain


def _recase(word, state, at_start):
    '''Return *word* with the casing it should have at this position.'''
    # States begin with the fine-grained tag (token.tag_), where proper nouns
    # are NNP/NNPS; 'PROPN' is a coarse token.pos_ value and never appears.
    is_proper = state.startswith('NNP')
    if not is_proper and word != 'I':
        word = word.lower()
    if at_start or word == 'i':
        # Upper-case the first character only: str.title() would also
        # capitalise after apostrophes and flatten the rest of a proper noun.
        word = word[:1].upper() + word[1:]
    return word


def _wrap_paragraph(sents):
    '''Join finished sentences and wrap them into one paragraph string.'''
    return fill(''.join(sents), replace_whitespace=False, drop_whitespace=False)


def generate(chain, words, word_cnt, rng):
    '''Generate paragraphs containing at least *word_cnt* words.

    *chain* is the table from :func:`build_chain`, *words* maps each state
    to its candidate words, and *rng* is a ``numpy.random.Generator``.
    Sentences are produced whole, so the total may overshoot *word_cnt*.
    Each paragraph holds 6-10 sentences, except the last one, which holds
    whatever was left when the word budget ran out.

    Returns a list of paragraph strings. Raises ``ValueError`` if the chain
    has no transition from START to a real state.
    '''
    start = chain.get(START)
    if start is None or all(choice == STOP for choice in start['choices']):
        raise ValueError('chain has no sentence-opening transition')

    paragraphs = []
    sents = []
    sents_left = rng.integers(5, 10)
    while word_cnt > 0:
        state = START
        sentence = []
        while True:
            table = chain[state]
            next_state = str(rng.choice(table['choices'], p=table['probs']))
            if next_state == STOP:
                if state == START:
                    # Never emit an empty sentence; draw again.
                    continue
                break
            word = str(rng.choice(words[next_state]))
            sentence.append(_recase(word, next_state, state == START))
            word_cnt -= 1
            state = next_state

        # Re-attach clitics such as "'s" that the tokenizer split off.
        sents.append(' '.join(sentence)
                     .replace(" '", "'")
                     .replace(" ’", "’")
                     .replace(" `", "`")
                     + '. ')
        sents_left -= 1
        if sents_left < 0:
            paragraphs.append(_wrap_paragraph(sents))
            sents = []
            sents_left = rng.integers(5, 10)

    # Sentences of an unfinished paragraph still count towards the requested
    # word total, so they must be emitted too.
    if sents:
        paragraphs.append(_wrap_paragraph(sents))
    return paragraphs


def main(argv=None):
    '''Build the chain from the input files and print the generated text.'''
    args = parse_args(argv)

    # Imported here so `--help`, argument errors and the pure helpers above
    # do not need spaCy to be importable.
    import spacy
    nlp = spacy.load('en_core_web_sm')

    rng = np.random.default_rng(args.seed)

    words = defaultdict(list)
    transitions = defaultdict(Counter)
    for infile in args.input:
        with open(infile, mode='r', encoding='utf-8') as f:
            input_text = f.read()
        collect(nlp(truncate_at_space(input_text)), words, transitions)

    if START not in transitions:
        sys.exit('markov.py: no usable sentences found in input')

    paragraphs = generate(build_chain(transitions), words, args.words, rng)
    # print() writes in text mode, which already translates '\n' to the
    # platform line ending; os.linesep here would double it on Windows.
    print('\n\n'.join(paragraphs))


if __name__ == '__main__':
    main()