From d258640ab315e7f598415a6d0047f36364356dd6 Mon Sep 17 00:00:00 2001 From: Blink The Things Date: Tue, 3 Nov 2020 12:20:10 -0500 Subject: [PATCH] Add syntactic dependency and word position to state --- markov.py | 23 ++++++++++++++++++++--- 1 file changed, 20 insertions(+), 3 deletions(-) diff --git a/markov.py b/markov.py index 40a9266..b46a9de 100644 --- a/markov.py +++ b/markov.py @@ -41,18 +41,30 @@ words_doc = nlp(words_text) words = {} for sent in words_doc.sents: + cnt = 0 for token in sent: if token.pos_ in ('SPACE', 'PUNCT', 'X'): continue - state = token.tag_ + cnt += 1 + word = token.text + state = f'{token.tag_},{token.dep_}' + if state in words: words[state].append(word) else: words[state] = [word] + state = f'{token.tag_},{token.dep_},{str(cnt)}' + + if state in words: + words[state].append(word) + else: + words[state] = [word] + + pos_text = '' with open(args.pos_file, mode='r') as f: pos_text = f.read() @@ -63,11 +75,13 @@ edges = [] for sent in pos_doc.sents: curr_state = 'START' + cnt = 0 for token in sent: if token.pos_ in ('SPACE', 'PUNCT', 'X'): continue - next_state = token.tag_ + cnt += 1 + next_state = f'{token.tag_},{token.dep_},{str(cnt)}' edges.append((curr_state, next_state)) @@ -110,7 +124,10 @@ for _ in range(10): sents.append(' '.join(choices)) break - word = rng.choice(words[next_choice]) + try: + word = rng.choice(words[next_choice]) + except KeyError: + word = rng.choice(words[','.join(next_choice.split(',')[:-1])]) choices.append(word)