From dc1cf4bdaa42eab68209a7a4fed4623317cd5ae7 Mon Sep 17 00:00:00 2001
From: Blink The Things
Date: Tue, 3 Nov 2020 22:10:19 -0500
Subject: [PATCH] Change the way input files are used

---
 markov.py | 80 +++++++++++++++++++++++++------------------------------
 1 file changed, 36 insertions(+), 44 deletions(-)

diff --git a/markov.py b/markov.py
index b46a9de..ddb69c7 100644
--- a/markov.py
+++ b/markov.py
@@ -22,72 +22,64 @@ import os
 import spacy
 
 parser = argparse.ArgumentParser(description='Generate a novel using Markov chains.')
-parser.add_argument('word_file', help='file used for word selection')
-parser.add_argument('pos_file', help='file used to build part-of-speech Markov chain')
+parser.add_argument('input', nargs='+', help='used to construct Markov transition matrix')
 parser.add_argument('-s', '--seed', type=int, help='seed for random number generator')
 args = parser.parse_args()
 
 nlp = spacy.load('en_core_web_sm')
 
-seed = args.seed or 12345
-
-rng = np.random.default_rng(seed)
-
-words_text = ''
-with open(args.word_file, mode='r') as f:
-    words_text = f.read()
-
-words_doc = nlp(words_text)
+rng = np.random.default_rng(args.seed or 12345)
 
 words = {}
-for sent in words_doc.sents:
-    cnt = 0
-    for token in sent:
-        if token.pos_ in ('SPACE', 'PUNCT', 'X'):
-            continue
+edges = []
 
-        cnt += 1
+input_text = ''
+for infile in args.input:
+    with open(infile, mode='r') as f:
+        input_text = f.read()
 
-        word = token.text
+    doc = nlp(input_text)
 
-        state = f'{token.tag_},{token.dep_}'
+    for sent in doc.sents:
+        cnt = 0
+        for token in sent:
+            if token.pos_ in ('SPACE', 'PUNCT', 'X'):
+                continue
 
-        if state in words:
-            words[state].append(word)
-        else:
-            words[state] = [word]
+            cnt += 1
 
-        state = f'{token.tag_},{token.dep_},{str(cnt)}'
+            word = token.text
 
-        if state in words:
-            words[state].append(word)
-        else:
-            words[state] = [word]
+            state = f'{token.tag_},{token.dep_}'
 
+            if state in words:
+                words[state].append(word)
+            else:
+                words[state] = [word]
 
-pos_text = ''
-with open(args.pos_file, mode='r') as f:
-    pos_text = f.read()
+            state = f'{token.tag_},{token.dep_},{str(cnt)}'
 
-pos_doc = nlp(pos_text)
+            if state in words:
+                words[state].append(word)
+            else:
+                words[state] = [word]
 
-edges = []
-for sent in pos_doc.sents:
-    curr_state = 'START'
+    for sent in doc.sents:
+        curr_state = 'START'
 
-    cnt = 0
-    for token in sent:
-        if token.pos_ in ('SPACE', 'PUNCT', 'X'):
-            continue
-        cnt += 1
-        next_state = f'{token.tag_},{token.dep_},{str(cnt)}'
+        cnt = 0
+        for token in sent:
+            if token.pos_ in ('SPACE', 'PUNCT', 'X'):
+                continue
 
-        edges.append((curr_state, next_state))
+            cnt += 1
+            next_state = f'{token.tag_},{token.dep_},{str(cnt)}'
 
-        curr_state = next_state
+            edges.append((curr_state, next_state))
 
-    edges.append((curr_state, 'STOP'))
+        curr_state = next_state
 
+        edges.append((curr_state, 'STOP'))
 
 transitions = {}
 for edge in edges: