blinkthethings
/
NaNoGenMo


								'''

								    markov.py - Generate a novel using Markov chains

								    Copyright (C) 2020  Blink The Things


								    This program is free software: you can redistribute it and/or modify

								    it under the terms of the GNU Affero General Public License as published by

								    the Free Software Foundation, either version 3 of the License, or

								    (at your option) any later version.


								    This program is distributed in the hope that it will be useful,

								    but WITHOUT ANY WARRANTY; without even the implied warranty of

								    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the

								    GNU Affero General Public License for more details.


								    You should have received a copy of the GNU Affero General Public License

								    along with this program.  If not, see <https://www.gnu.org/licenses/>.

								'''


								import argparse

								import numpy as np

								import os

								import spacy


								# --- Command-line interface and shared resources ---------------------------
								# `input` takes one or more file paths whose text is used to build the
								# Markov transition table; `--seed` makes a run reproducible.
								parser = argparse.ArgumentParser(description='Generate a novel using Markov chains.')
								parser.add_argument('input', nargs='+', help='used to construct Markov transition matrix')
								parser.add_argument('-s', '--seed', type=int, help='seed for random number generator')
								args = parser.parse_args()

								# spaCy pipeline used below to split the input into sentences and to read
								# each token's part of speech, fine-grained tag and dependency label.
								nlp = spacy.load('en_core_web_sm')

								# Use the fixed default seed only when no seed was supplied.  Testing
								# `is None` instead of truthiness keeps an explicit `--seed 0` from being
								# silently replaced by 12345.
								rng = np.random.default_rng(12345 if args.seed is None else args.seed)


								# words: state label -> every token text observed in that state.  Each kept
								#        token is filed under both 'TAG,DEP' and the position-qualified
								#        'TAG,DEP,N' label.
								# edges: one (from_state, to_state) pair per observed transition; every
								#        sentence is bracketed by the synthetic 'START' and 'STOP' states.
								# NOTE: `words` has to remain a plain dict - the generation step further
								# down relies on KeyError for labels that are not present.
								words = {}
								edges = []

								# Coarse part-of-speech classes that are ignored entirely.
								SKIPPED_POS = ('SPACE', 'PUNCT', 'X')

								for infile in args.input:
								    # Decode explicitly so the corpus is read the same way on every
								    # platform instead of depending on the locale's default encoding.
								    with open(infile, mode='r', encoding='utf-8') as f:
								        input_text = f.read()

								    doc = nlp(input_text)

								    # Single pass per sentence: the word buckets and the transition edges
								    # are independent structures, so filling them together yields the same
								    # contents (and ordering) as two separate passes.
								    for sent in doc.sents:
								        curr_state = 'START'
								        cnt = 0  # 1-based position among the kept tokens of this sentence

								        for token in sent:
								            if token.pos_ in SKIPPED_POS:
								                continue

								            cnt += 1
								            word = token.text
								            base_state = f'{token.tag_},{token.dep_}'
								            next_state = f'{base_state},{cnt}'

								            words.setdefault(base_state, []).append(word)
								            words.setdefault(next_state, []).append(word)

								            edges.append((curr_state, next_state))
								            curr_state = next_state

								        # A sentence with no kept tokens still records ('START', 'STOP').
								        edges.append((curr_state, 'STOP'))


								# Tally the observed edges per source state.  Resulting shape:
								#   transitions[src] == {'cnt': <edges leaving src>,
								#                        'to': {dst: <times src -> dst was seen>}}
								# Destinations keep first-seen order inside 'to'.
								transitions = {}
								for src, dst in edges:
								    record = transitions.setdefault(src, {'cnt': 0, 'to': {}})
								    record['cnt'] += 1
								    record['to'][dst] = record['to'].get(dst, 0) + 1


								# Turn the raw counts into per-state sampling tables:
								#   chain[state] == {'choices': [successor, ...],
								#                    'probs':   [relative frequency, ...]}
								# 'choices' and 'probs' are aligned index-for-index.
								chain = {}
								for state, record in transitions.items():
								    total = record['cnt']
								    successors = list(record['to'])
								    chain[state] = {
								        'choices': successors,
								        'probs': [record['to'][nxt] / total for nxt in successors],
								    }


								# Number of sentences produced per run.
								SENTENCE_COUNT = 10

								# If 'START' is missing, or its only successor is 'STOP', every walk would
								# either raise KeyError or be rejected at 'START' forever.  Stop with a
								# readable message instead.
								if all(nxt == 'STOP' for nxt in chain.get('START', {}).get('choices', [])):
								    raise SystemExit('no usable sentences found in the input; nothing to generate')

								sents = []
								for _ in range(SENTENCE_COUNT):
								    choice = 'START'   # current state of the walk
								    choices = []       # words emitted so far for this sentence

								    while True:
								        next_choice = rng.choice(chain[choice]['choices'], p=chain[choice]['probs'])

								        if next_choice == 'STOP':
								            if choice == 'START':
								                # Reject an empty sentence and draw again from 'START'.
								                continue
								            sents.append(' '.join(choices))
								            break

								        try:
								            word = rng.choice(words[next_choice])
								        except KeyError:
								            # Fall back to the position-independent 'TAG,DEP' bucket by
								            # dropping the trailing position component of the label.
								            word = rng.choice(words[','.join(next_choice.split(',')[:-1])])

								        choices.append(word)
								        choice = next_choice

								# print() writes in text mode, which already translates '\n' to the
								# platform line ending; joining with os.linesep would produce '\r\r\n'
								# on Windows.
								print('\n'.join(sents))