blinkthethings
/
NaNoGenMo


								'''

								    markov.py - Generate a novel using Markov chains

								    Copyright (C) 2020  Blink The Things


								    This program is free software: you can redistribute it and/or modify

								    it under the terms of the GNU Affero General Public License as published by

								    the Free Software Foundation, either version 3 of the License, or

								    (at your option) any later version.


								    This program is distributed in the hope that it will be useful,

								    but WITHOUT ANY WARRANTY; without even the implied warranty of

								    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the

								    GNU Affero General Public License for more details.


								    You should have received a copy of the GNU Affero General Public License

								    along with this program.  If not, see <https://www.gnu.org/licenses/>.

								'''


								import argparse

								import numpy as np

								import os

								import spacy


								# Command-line interface: two positional file paths.
								parser = argparse.ArgumentParser(
								    description='Generate a novel using Markov chains.',
								)
								for arg_name, arg_help in (
								    ('word_file', 'File used for word selection'),
								    ('pos_file', 'File used to build part-of-speech Markov chain'),
								):
								    parser.add_argument(arg_name, help=arg_help)
								args = parser.parse_args()

								# spaCy pipeline that supplies the pos_/tag_ annotations read below.
								nlp = spacy.load('en_core_web_sm')

								# The generator is seeded with a fixed value, so repeated runs on the
								# same inputs draw the same random sequence.
								seed = 12345
								rng = np.random.default_rng(seed)


								# Read the vocabulary source. The encoding is given explicitly so that
								# decoding does not depend on the platform's locale default.
								with open(args.word_file, mode='r', encoding='utf-8') as f:
								    words_text = f.read()

								words_doc = nlp(words_text)

								# Map each fine-grained tag (token.tag_) to the list of token texts seen
								# with that tag. Repeated words are kept, so a uniform draw from one of
								# these lists is weighted by how often each word occurred.
								words = {}
								for sent in words_doc.sents:
								    for token in sent:
								        # Tokens whose coarse part of speech is SPACE, PUNCT or X are
								        # never offered as words.
								        if token.pos_ in ('SPACE', 'PUNCT', 'X'):
								            continue
								        words.setdefault(token.tag_, []).append(token.text)


								# Read the text whose sentence structure drives the Markov chain. The
								# encoding is given explicitly so that decoding does not depend on the
								# platform's locale default.
								with open(args.pos_file, mode='r', encoding='utf-8') as f:
								    pos_text = f.read()

								pos_doc = nlp(pos_text)

								# Collect one (from_state, to_state) pair per step of every sentence.
								# Each sentence is walked as START -> tag_1 -> ... -> tag_n -> STOP,
								# ignoring tokens whose coarse part of speech is SPACE, PUNCT or X.
								# A sentence with no remaining tokens contributes ('START', 'STOP').
								edges = []
								for sent in pos_doc.sents:
								    tags = [
								        token.tag_
								        for token in sent
								        if token.pos_ not in ('SPACE', 'PUNCT', 'X')
								    ]
								    path = ['START', *tags, 'STOP']
								    edges.extend(zip(path, path[1:]))


								# Tally the outgoing transitions of every state:
								#   transitions[src] == {'cnt': number of edges leaving src,
								#                        'to':  {dst: number of src -> dst edges}}
								transitions = {}
								for src, dst in edges:
								    record = transitions.setdefault(src, {'cnt': 0, 'to': {}})
								    record['cnt'] += 1
								    record['to'][dst] = record['to'].get(dst, 0) + 1


								# Turn the raw tallies into parallel lists: the possible next states and
								# the relative frequency of each (count divided by the state's total).
								chain = {}
								for state, tally in transitions.items():
								    total = tally['cnt']
								    targets = tally['to']
								    chain[state] = {
								        'choices': list(targets),
								        'probs': [targets[target] / total for target in targets],
								    }


								# Generation needs a 'START' state that can lead somewhere other than
								# straight to 'STOP'. Without one, the lookup below fails (no sentences
								# were read) or the re-draw step below never accepts a draw.
								if not any(
								    tag != 'STOP'
								    for tag in chain.get('START', {}).get('choices', ())
								):
								    raise SystemExit(
								        'pos_file contains no usable sentences; cannot generate text.'
								    )

								# Generate ten sentences by walking the chain from START to STOP.
								sents = []
								for _ in range(10):
								    choice = 'START'
								    choices = []

								    while True:
								        next_choice = rng.choice(
								            chain[choice]['choices'], p=chain[choice]['probs']
								        )

								        # Draw again rather than emit a sentence with no words.
								        if choice == 'START' and next_choice == 'STOP':
								            continue

								        if next_choice == 'STOP':
								            sents.append(' '.join(choices))
								            break

								        # A tag seen in pos_file may never occur in word_file. There
								        # is then nothing to draw from, so no word is emitted for this
								        # step, but the walk still advances to the new state.
								        candidates = words.get(next_choice)
								        if candidates:
								            choices.append(rng.choice(candidates))

								        choice = next_choice

								# Join with '\n': text-mode stdout already converts it to the platform
								# line ending, so joining with os.linesep would double the '\r' on
								# Windows.
								print('\n'.join(sents))