# blinkthethings/NaNoGenMo

'''
    markov.py - Generate a novel using Markov chains
    Copyright (C) 2020  Blink The Things
    This program is free software: you can redistribute it and/or modify    it under the terms of the GNU Affero General Public License as published by    the Free Software Foundation, either version 3 of the License, or    (at your option) any later version.
    This program is distributed in the hope that it will be useful,    but WITHOUT ANY WARRANTY; without even the implied warranty of    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the    GNU Affero General Public License for more details.
    You should have received a copy of the GNU Affero General Public License    along with this program.  If not, see <https://www.gnu.org/licenses/>.'''

import argparse
import os
from collections import Counter

import numpy as np
import spacy
# Command-line interface: two corpus paths plus an optional RNG seed.
# The seed defaults to the value that used to be hard-coded, so invoking the
# script without --seed draws from the same random stream as before.
parser = argparse.ArgumentParser(description='Generate a novel using Markov chains.')
parser.add_argument('word_file', help='File used for word selection')
parser.add_argument('pos_file', help='File used to build part-of-speech Markov chain')
parser.add_argument(
    '--seed',
    type=int,
    default=12345,
    help='Seed for the random number generator (default: %(default)s)',
)
args = parser.parse_args()

# Language pipeline used to tokenize and tag both input files.
nlp = spacy.load('en_core_web_sm')

# A seeded generator makes a run reproducible for a given pair of inputs.
seed = args.seed
rng = np.random.default_rng(seed)
# Read the word-selection corpus. The encoding is spelled out so the text is
# decoded the same way on every platform instead of following the locale
# default.
with open(args.word_file, mode='r', encoding='utf-8') as f:
    words_text = f.read()

words_doc = nlp(words_text)

# Map each tag (token.tag_) to the list of token texts that carried it.
# Repeated words are kept, so a uniform draw from a list favours the words
# that occur most often in the corpus.
words = {}
for sent in words_doc.sents:
    for token in sent:
        # Tokens whose coarse part of speech is SPACE, PUNCT or X are never
        # offered as candidate words.
        if token.pos_ in ('SPACE', 'PUNCT', 'X'):
            continue

        words.setdefault(token.tag_, []).append(token.text)
# Read the corpus whose sentence structure drives the Markov chain. The
# encoding is spelled out so decoding does not depend on the platform's
# locale default.
with open(args.pos_file, mode='r', encoding='utf-8') as f:
    pos_text = f.read()

pos_doc = nlp(pos_text)

# Record every observed tag-to-tag transition as a (from, to) pair. Each
# sentence is bracketed by the synthetic states 'START' and 'STOP'.
edges = []
for sent in pos_doc.sents:
    curr_state = 'START'

    for token in sent:
        # Tokens whose coarse part of speech is SPACE, PUNCT or X do not
        # take part in the chain.
        if token.pos_ in ('SPACE', 'PUNCT', 'X'):
            continue

        next_state = token.tag_
        edges.append((curr_state, next_state))
        curr_state = next_state

    # A sentence made only of skipped tokens contributes ('START', 'STOP').
    edges.append((curr_state, 'STOP'))
# Tally the transitions. For every source state keep:
#   'cnt' - total number of edges leaving the state
#   'to'  - per-destination edge counts, in first-seen order (Counter is a
#           dict subclass, so insertion order is preserved)
transitions = {}
for src, dst in edges:
    entry = transitions.setdefault(src, {'cnt': 0, 'to': Counter()})
    entry['cnt'] += 1
    entry['to'][dst] += 1
# Turn the raw counts into a sampling table: for each state, the list of
# possible next states and, in the same order, the relative frequency of each.
chain = {
    state: {
        'choices': list(stats['to']),
        'probs': [seen / stats['cnt'] for seen in stats['to'].values()],
    }
    for state, stats in transitions.items()
}
# Generate ten sentences by walking the chain from 'START' until 'STOP' is
# drawn, emitting one randomly chosen word for each tag visited.

# The walk below redraws whenever the very first transition is 'STOP'. If
# 'START' has no other successor (or is missing because the POS file produced
# no sentences) that redraw would never terminate, or would fail with a bare
# KeyError, so stop early with a readable message instead.
start = chain.get('START')
if start is None or all(tag == 'STOP' for tag in start['choices']):
    raise SystemExit('pos_file does not contain any sentence with usable tokens')

sents = []
for _ in range(10):
    choice = 'START'
    choices = []

    while True:
        next_choice = rng.choice(chain[choice]['choices'], p=chain[choice]['probs'])

        # Never produce a sentence by stopping immediately: draw again.
        if choice == 'START' and next_choice == 'STOP':
            continue

        if next_choice == 'STOP':
            sents.append(' '.join(choices))
            break

        # The chain comes from pos_file while the vocabulary comes from
        # word_file, so a tag may have no candidate words. Such a tag still
        # advances the walk but contributes no word, rather than aborting the
        # whole run with a KeyError.
        if next_choice in words:
            word = rng.choice(words[next_choice])
            choices.append(word)

        choice = next_choice
# Emit one sentence per line. A plain '\n' is used because stdout is a
# text-mode stream that already translates newlines for the platform; joining
# with os.linesep would come out as '\r\r\n' on Windows.
print('\n'.join(sents))