blinkthethings
/
NaNoGenMo


								'''

								    markov.py - Generate a novel using Markov chains

								    Copyright (C) 2020  Blink The Things


								    This program is free software: you can redistribute it and/or modify

								    it under the terms of the GNU Affero General Public License as published by

								    the Free Software Foundation, either version 3 of the License, or

								    (at your option) any later version.


								    This program is distributed in the hope that it will be useful,

								    but WITHOUT ANY WARRANTY; without even the implied warranty of

								    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the

								    GNU Affero General Public License for more details.


								    You should have received a copy of the GNU Affero General Public License

								    along with this program.  If not, see <https://www.gnu.org/licenses/>.

								'''


								import argparse

								import numpy as np

								import os

								import spacy

								import sys

								from textwrap import fill


								# Command-line interface: one or more input files, plus an optional
								# word target and an optional random seed.
								parser = argparse.ArgumentParser(
								    description='Generate a novel using Markov chains.',
								)
								parser.add_argument(
								    'input',
								    nargs='+',
								    help='used to construct Markov transition matrix',
								)
								parser.add_argument(
								    '-w', '--words',
								    type=int,
								    help='generate at least WORDS words',
								)
								parser.add_argument(
								    '-s', '--seed',
								    type=int,
								    help='seed for random number generator',
								)
								args = parser.parse_args()


								# spaCy pipeline used to tag and parse the input text.
								nlp = spacy.load('en_core_web_sm')

								# Compare against None instead of relying on truthiness, so an explicit
								# `--seed 0` or `--words 0` is honoured rather than silently replaced by
								# the default value.
								rng = np.random.default_rng(12345 if args.seed is None else args.seed)
								word_cnt = 100 if args.words is None else args.words


								# words maps a 'TAG,DEP' state to the text of every token seen in that
								# state; edges lists the (state, next_state) transitions in reading order,
								# with each sentence framed by the synthetic 'START' and 'STOP' states.
								words = {}
								edges = []

								# At most this many characters of each input file are parsed.
								# NOTE(review): presumably chosen to stay within spaCy's default
								# `nlp.max_length` -- confirm before changing.
								MAX_CHARS = 1000000

								# Tokens with these coarse part-of-speech tags are left out of the model.
								SKIPPED_POS = ('SPACE', 'PUNCT', 'X')

								for infile in args.input:
								    with open(infile, mode='r') as f:
								        input_text = f.read()

								    # Cut long inputs at the last space at or before MAX_CHARS so that no
								    # word is split. rfind() cannot run off the front of the string the way
								    # a manual backwards index scan can.
								    cut = MAX_CHARS
								    if len(input_text) > MAX_CHARS:
								        cut = input_text.rfind(' ', 0, MAX_CHARS + 1)
								        if cut < 0:
								            # No space anywhere in the prefix: fall back to a hard cut.
								            cut = MAX_CHARS

								    doc = nlp(input_text[:cut])

								    # A single pass records both the vocabulary of each state and the
								    # state-to-state transitions of every sentence.
								    for sent in doc.sents:
								        curr_state = 'START'
								        for token in sent:
								            if token.pos_ in SKIPPED_POS:
								                continue

								            next_state = f'{token.tag_},{token.dep_}'

								            # Each token is stored once per occurrence, so frequent words
								            # are proportionally more likely to be drawn later.
								            words.setdefault(next_state, []).append(token.text)
								            edges.append((curr_state, next_state))

								            curr_state = next_state

								        edges.append((curr_state, 'STOP'))


								# Tally the observed transitions. For every source state we keep the total
								# number of outgoing edges ('cnt') and a per-destination count ('to').
								transitions = {}
								for src, dst in edges:
								    bucket = transitions.setdefault(src, {'cnt': 0, 'to': {}})
								    bucket['cnt'] += 1
								    bucket['to'][dst] = bucket['to'].get(dst, 0) + 1


								# Turn the raw counts into, for each state, a list of possible next states
								# and a parallel list of their relative frequencies.
								chain = {}
								for state, info in transitions.items():
								    total = info['cnt']
								    targets = list(info['to'])
								    chain[state] = {
								        'choices': targets,
								        'probs': [info['to'][target] / total for target in targets],
								    }


								def _wrap_paragraph(sentences):
								    '''Join finished sentences and wrap them into one paragraph of text.'''
								    return fill(''.join(sentences), replace_whitespace=False, drop_whitespace=False)


								# Generation needs a START state that can lead to at least one real state.
								# Without this check the loop below would either raise a bare KeyError or
								# keep re-drawing 'STOP' from 'START' forever.
								if word_cnt > 0:
								    start = chain.get('START')
								    if start is None or all(c == 'STOP' for c in start['choices']):
								        sys.exit('markov.py: the input contains no usable sentences')

								sents = []
								paragraphs = []

								# Countdown of sentences for the current paragraph. The paragraph is closed
								# once the counter drops below zero, i.e. after `paragraph_sent_cnt + 1`
								# sentences.
								paragraph_sent_cnt = rng.integers(5, 10)

								# word_cnt is only checked between sentences, so the sentence in progress
								# is always finished ("at least WORDS words").
								while word_cnt > 0:
								    choice = 'START'
								    choices = []

								    while True:
								        node = chain[choice]
								        next_choice = rng.choice(node['choices'], p=node['probs'])

								        if next_choice == 'STOP':
								            if choice == 'START':
								                # Never emit an empty sentence; draw again.
								                continue

								            # Join the words, then remove the space that the join put in
								            # front of tokens starting with an apostrophe or a backtick.
								            sentence = (
								                ' '.join(choices)
								                .replace(" '", "'")
								                .replace(" ’", "’")
								                .replace(" `", "`")
								            )
								            sents.append(sentence + '. ')

								            paragraph_sent_cnt -= 1
								            if paragraph_sent_cnt < 0:
								                paragraphs.append(_wrap_paragraph(sents))
								                sents = []
								                paragraph_sent_cnt = rng.integers(5, 10)
								            break

								        try:
								            word = rng.choice(words[next_choice])
								        except KeyError:
								            # Fall back to the state with its last comma-separated
								            # component removed when the full state has no recorded words.
								            word = rng.choice(words[','.join(next_choice.split(',')[:-1])])

								        if choice == 'START' or word == 'i':
								            # Capitalise the first word of a sentence and the pronoun "I".
								            word = str.title(word)
								        elif not (next_choice.startswith('PROPN') or word == 'I'):
								            # Everything except proper nouns and "I" is lower-cased.
								            word = str.lower(word)

								        choices.append(word)
								        word_cnt -= 1
								        choice = next_choice

								# Flush the sentences of the last, unfinished paragraph. Without this they
								# are silently dropped and the output can contain fewer than the requested
								# number of words (or nothing at all for small targets).
								if sents:
								    paragraphs.append(_wrap_paragraph(sents))

								# print() writes in text mode, which already translates '\n' to the
								# platform line ending; using os.linesep here would produce '\r\r\n' on
								# Windows.
								print('\n\n'.join(paragraphs))