# blinkthethings/NaNoGenMo

'''
    markov.py - Generate a novel using Markov chains
    Copyright (C) 2020  Blink The Things

    This program is free software: you can redistribute it and/or modify
    it under the terms of the GNU Affero General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU Affero General Public License for more details.

    You should have received a copy of the GNU Affero General Public License
    along with this program.  If not, see <https://www.gnu.org/licenses/>.
'''

import argparse
import os
import sys
from collections import Counter
from textwrap import fill

import numpy as np
import spacy
# Tokens whose coarse part-of-speech (``token.pos_``) is one of these are
# left out of the model entirely.
SKIPPED_POS = ('SPACE', 'PUNCT', 'X')

# Only this many leading characters of each input file are parsed.
# NOTE(review): presumably chosen to match spaCy's default ``nlp.max_length``
# -- confirm before changing.
MAX_INPUT_CHARS = 1000000

DEFAULT_SEED = 12345
DEFAULT_WORD_COUNT = 100

# State names start with ``token.tag_``.  'PROPN' is what the original check
# looked for; 'NNP' (also matching 'NNPS') is added because 'PROPN' is a
# ``pos_`` label and therefore never appears at the start of a tag-based state.
# NOTE(review): assumes the loaded English model emits Penn-Treebank-style
# fine-grained tags -- confirm against the model's label scheme.
PROPER_NOUN_PREFIXES = ('NNP', 'PROPN')


def build_parser():
    '''Return the command-line parser for the generator.'''
    parser = argparse.ArgumentParser(description='Generate a novel using Markov chains.')
    parser.add_argument('input', nargs='+', help='used to construct Markov transition matrix')
    parser.add_argument('-w', '--words', type=int, help='generate at least WORDS words')
    parser.add_argument('-s', '--seed', type=int, help='seed for random number generator')
    return parser


def truncate_at_space(text, limit=MAX_INPUT_CHARS):
    '''Return *text* cut back to the last space at or before index *limit*.

    Text no longer than *limit* is returned unchanged.  When there is no
    usable space to break on, the text is cut hard at *limit* (the original
    loop walked the index below zero in that case).
    '''
    if len(text) <= limit:
        return text
    cut = text.rfind(' ', 0, limit + 1)
    if cut <= 0:
        return text[:limit]
    return text[:cut]


def collect_states(doc, words, edges):
    '''Record the vocabulary and state transitions found in *doc*.

    *doc* must expose ``sents``; each sentence yields tokens carrying
    ``text``, ``pos_``, ``tag_`` and ``dep_``.  A state is the string
    ``'<tag_>,<dep_>'``.

    *words* (state -> list of token texts) and *edges* (list of
    ``(from_state, to_state)`` pairs) are extended in place.  Every sentence
    contributes a path from ``'START'`` to ``'STOP'``; a sentence with no
    usable token contributes the single edge ``('START', 'STOP')``.
    '''
    for sent in doc.sents:
        curr_state = 'START'
        for token in sent:
            if token.pos_ in SKIPPED_POS:
                continue
            state = f'{token.tag_},{token.dep_}'
            # Each occurrence is stored once; repeated occurrences of a word
            # are what weight it when sampling uniformly from the list.
            words.setdefault(state, []).append(token.text)
            edges.append((curr_state, state))
            curr_state = state
        edges.append((curr_state, 'STOP'))


def build_chain(edges):
    '''Turn ``(from_state, to_state)`` pairs into a transition table.

    Returns a dict mapping each source state to
    ``{'choices': [...], 'probs': [...]}`` where ``probs[i]`` is the relative
    frequency of the transition to ``choices[i]``.  Choices keep the order in
    which they were first seen.
    '''
    outgoing = {}
    for src, dst in edges:
        outgoing.setdefault(src, Counter())[dst] += 1

    chain = {}
    for src, counts in outgoing.items():
        total = sum(counts.values())
        chain[src] = {
            'choices': list(counts),
            'probs': [n / total for n in counts.values()],
        }
    return chain


def _wrap_paragraph(sents):
    '''Join finished sentences and line-wrap them as one paragraph.'''
    return fill(''.join(sents), replace_whitespace=False, drop_whitespace=False)


def generate_paragraphs(chain, words, rng, word_cnt):
    '''Generate paragraphs containing at least *word_cnt* words in total.

    *chain* is a table as returned by ``build_chain``, *words* maps states to
    candidate words, and *rng* is a ``numpy.random.Generator``.  Generation
    always finishes the sentence in progress, so the total may exceed
    *word_cnt*.

    Raises ``ValueError`` when *chain* has no ``'START'`` state or ``'START'``
    can only lead to ``'STOP'``; sampling could never produce a word then.
    '''
    start = chain.get('START')
    if start is None or all(state == 'STOP' for state in start['choices']):
        raise ValueError('input contains no usable sentences to build a chain from')

    sents = []
    paragraphs = []
    paragraph_sent_cnt = rng.integers(5, 10)
    while word_cnt > 0:
        state = 'START'
        sentence = []
        while True:
            next_state = rng.choice(chain[state]['choices'], p=chain[state]['probs'])

            if next_state == 'STOP':
                if state == 'START':
                    # An empty sentence was drawn; draw again.
                    continue
                # Re-attach tokens that begin with a quote mark (such as
                # clitics) to the preceding word.
                text = ' '.join(sentence)
                text = text.replace(" '", "'").replace(" ’", "’").replace(" `", "`")
                sents.append(text + '. ')

                paragraph_sent_cnt -= 1
                if paragraph_sent_cnt < 0:
                    paragraphs.append(_wrap_paragraph(sents))
                    sents = []
                    paragraph_sent_cnt = rng.integers(5, 10)
                break

            try:
                word = rng.choice(words[next_state])
            except KeyError:
                # Fall back to the state name minus its last component.
                # NOTE(review): collect_states never stores such keys, so this
                # only helps with externally supplied tables -- confirm intent.
                word = rng.choice(words[','.join(next_state.split(',')[:-1])])

            if state == 'START' or word == 'i':
                word = str.title(word)
            elif not (next_state.startswith(PROPER_NOUN_PREFIXES) or word == 'I'):
                word = str.lower(word)

            sentence.append(word)
            word_cnt -= 1
            state = next_state

    # Sentences still buffered when the word budget ran out used to be
    # discarded, which could leave the output short of the requested length
    # (or empty).  Emit them as a final paragraph.
    if sents:
        paragraphs.append(_wrap_paragraph(sents))
    return paragraphs


def main(argv=None):
    '''Command-line entry point: build the chain from the inputs and print a novel.

    *argv* defaults to ``sys.argv[1:]`` (argparse behaviour).
    '''
    args = build_parser().parse_args(argv)
    # Compare against None so that an explicit 0 is honoured.
    seed = DEFAULT_SEED if args.seed is None else args.seed
    word_cnt = DEFAULT_WORD_COUNT if args.words is None else args.words

    nlp = spacy.load('en_core_web_sm')
    rng = np.random.default_rng(seed)

    words = {}
    edges = []
    for infile in args.input:
        with open(infile, mode='r') as f:
            input_text = f.read()
        collect_states(nlp(truncate_at_space(input_text)), words, edges)

    chain = build_chain(edges)
    try:
        paragraphs = generate_paragraphs(chain, words, rng, word_cnt)
    except ValueError as err:
        sys.exit(f'error: {err}')

    # print() writes in text mode, which already translates '\n' to the
    # platform line ending; os.linesep here would double the '\r' on Windows.
    print('\n\n'.join(paragraphs))


if __name__ == '__main__':
    main()