|
@ -22,72 +22,64 @@ import os |
|
|
import spacy |
|
|
import spacy |
|
|
|
|
|
|
|
|
parser = argparse.ArgumentParser(description='Generate a novel using Markov chains.') |
|
|
parser = argparse.ArgumentParser(description='Generate a novel using Markov chains.') |
|
|
parser.add_argument('word_file', help='file used for word selection') |
|
|
|
|
|
parser.add_argument('pos_file', help='file used to build part-of-speech Markov chain') |
|
|
|
|
|
|
|
|
parser.add_argument('input', nargs='+', help='used to construct Markov transition matrix') |
|
|
parser.add_argument('-s', '--seed', type=int, help='seed for random number generator') |
|
|
parser.add_argument('-s', '--seed', type=int, help='seed for random number generator') |
|
|
args = parser.parse_args() |
|
|
args = parser.parse_args() |
|
|
|
|
|
|
|
|
nlp = spacy.load('en_core_web_sm') |
|
|
nlp = spacy.load('en_core_web_sm') |
|
|
|
|
|
|
|
|
seed = args.seed or 12345 |
|
|
|
|
|
|
|
|
|
|
|
rng = np.random.default_rng(seed) |
|
|
|
|
|
|
|
|
|
|
|
words_text = '' |
|
|
|
|
|
with open(args.word_file, mode='r') as f: |
|
|
|
|
|
words_text = f.read() |
|
|
|
|
|
|
|
|
|
|
|
words_doc = nlp(words_text) |
|
|
|
|
|
|
|
|
rng = np.random.default_rng(args.seed or 12345) |
|
|
|
|
|
|
|
|
words = {} |
|
|
words = {} |
|
|
for sent in words_doc.sents: |
|
|
|
|
|
cnt = 0 |
|
|
|
|
|
for token in sent: |
|
|
|
|
|
if token.pos_ in ('SPACE', 'PUNCT', 'X'): |
|
|
|
|
|
continue |
|
|
|
|
|
|
|
|
edges = [] |
|
|
|
|
|
|
|
|
cnt += 1 |
|
|
|
|
|
|
|
|
input_text = '' |
|
|
|
|
|
for infile in args.input: |
|
|
|
|
|
with open(infile, mode='r') as f: |
|
|
|
|
|
input_text = f.read() |
|
|
|
|
|
|
|
|
word = token.text |
|
|
|
|
|
|
|
|
doc = nlp(input_text) |
|
|
|
|
|
|
|
|
state = f'{token.tag_},{token.dep_}' |
|
|
|
|
|
|
|
|
for sent in doc.sents: |
|
|
|
|
|
cnt = 0 |
|
|
|
|
|
for token in sent: |
|
|
|
|
|
if token.pos_ in ('SPACE', 'PUNCT', 'X'): |
|
|
|
|
|
continue |
|
|
|
|
|
|
|
|
if state in words: |
|
|
|
|
|
words[state].append(word) |
|
|
|
|
|
else: |
|
|
|
|
|
words[state] = [word] |
|
|
|
|
|
|
|
|
cnt += 1 |
|
|
|
|
|
|
|
|
state = f'{token.tag_},{token.dep_},{str(cnt)}' |
|
|
|
|
|
|
|
|
word = token.text |
|
|
|
|
|
|
|
|
if state in words: |
|
|
|
|
|
words[state].append(word) |
|
|
|
|
|
else: |
|
|
|
|
|
words[state] = [word] |
|
|
|
|
|
|
|
|
state = f'{token.tag_},{token.dep_}' |
|
|
|
|
|
|
|
|
|
|
|
if state in words: |
|
|
|
|
|
words[state].append(word) |
|
|
|
|
|
else: |
|
|
|
|
|
words[state] = [word] |
|
|
|
|
|
|
|
|
pos_text = '' |
|
|
|
|
|
with open(args.pos_file, mode='r') as f: |
|
|
|
|
|
pos_text = f.read() |
|
|
|
|
|
|
|
|
state = f'{token.tag_},{token.dep_},{str(cnt)}' |
|
|
|
|
|
|
|
|
pos_doc = nlp(pos_text) |
|
|
|
|
|
|
|
|
if state in words: |
|
|
|
|
|
words[state].append(word) |
|
|
|
|
|
else: |
|
|
|
|
|
words[state] = [word] |
|
|
|
|
|
|
|
|
edges = [] |
|
|
|
|
|
for sent in pos_doc.sents: |
|
|
|
|
|
curr_state = 'START' |
|
|
|
|
|
|
|
|
for sent in doc.sents: |
|
|
|
|
|
curr_state = 'START' |
|
|
|
|
|
|
|
|
cnt = 0 |
|
|
|
|
|
for token in sent: |
|
|
|
|
|
if token.pos_ in ('SPACE', 'PUNCT', 'X'): |
|
|
|
|
|
continue |
|
|
|
|
|
|
|
|
cnt = 0 |
|
|
|
|
|
for token in sent: |
|
|
|
|
|
if token.pos_ in ('SPACE', 'PUNCT', 'X'): |
|
|
|
|
|
continue |
|
|
|
|
|
|
|
|
cnt += 1 |
|
|
|
|
|
next_state = f'{token.tag_},{token.dep_},{str(cnt)}' |
|
|
|
|
|
|
|
|
cnt += 1 |
|
|
|
|
|
next_state = f'{token.tag_},{token.dep_},{str(cnt)}' |
|
|
|
|
|
|
|
|
edges.append((curr_state, next_state)) |
|
|
|
|
|
|
|
|
edges.append((curr_state, next_state)) |
|
|
|
|
|
|
|
|
curr_state = next_state |
|
|
|
|
|
|
|
|
curr_state = next_state |
|
|
|
|
|
|
|
|
edges.append((curr_state, 'STOP')) |
|
|
|
|
|
|
|
|
edges.append((curr_state, 'STOP')) |
|
|
|
|
|
|
|
|
transitions = {} |
|
|
transitions = {} |
|
|
for edge in edges: |
|
|
for edge in edges: |
|
|