|
|
- '''
- markov.py - Generate a novel using Markov chains
- Copyright (C) 2020 Blink The Things
-
- This program is free software: you can redistribute it and/or modify
- it under the terms of the GNU Affero General Public License as published by
- the Free Software Foundation, either version 3 of the License, or
- (at your option) any later version.
-
- This program is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU Affero General Public License for more details.
-
- You should have received a copy of the GNU Affero General Public License
- along with this program. If not, see <https://www.gnu.org/licenses/>.
- '''
-
- import argparse
- import numpy as np
- import os
- import spacy
-
# Coarse part-of-speech classes that never become chain states or vocabulary.
IGNORED_POS = ('SPACE', 'PUNCT', 'X')

# Seed used when --seed is not supplied, so runs are reproducible by default.
DEFAULT_SEED = 12345


def resolve_seed(seed):
    '''Return the seed to use, falling back to DEFAULT_SEED only for None.

    An explicit seed of 0 is a valid choice and must not be replaced by the
    default, which is why this tests against None instead of truthiness.
    '''
    return DEFAULT_SEED if seed is None else seed


def content_tokens(sent):
    '''Yield (position, token) for each token of sent that is not ignored.

    Tokens whose pos_ is in IGNORED_POS are skipped and do not advance the
    position counter; positions start at 1.
    '''
    position = 0
    for token in sent:
        if token.pos_ in IGNORED_POS:
            continue
        position += 1
        yield position, token


def build_word_index(doc):
    '''Map states to the words observed in them, in document order.

    Every usable token is filed under two keys: 'tag,dep' and the
    position-specific 'tag,dep,position'. The shorter key is the fallback
    used when a position-specific state has no vocabulary of its own.
    '''
    words = {}
    for sent in doc.sents:
        for position, token in content_tokens(sent):
            base = f'{token.tag_},{token.dep_}'
            for state in (base, f'{base},{position}'):
                words.setdefault(state, []).append(token.text)
    return words


def collect_edges(doc):
    '''Return the (from_state, to_state) pairs walked by each sentence of doc.

    Each sentence contributes a path START -> 'tag,dep,1' -> ... -> STOP. A
    sentence with no usable tokens contributes the single edge START -> STOP.
    '''
    edges = []
    for sent in doc.sents:
        state = 'START'
        for position, token in content_tokens(sent):
            following = f'{token.tag_},{token.dep_},{position}'
            edges.append((state, following))
            state = following
        edges.append((state, 'STOP'))
    return edges


def build_chain(edges):
    '''Turn edges into {state: {'choices': [...], 'probs': [...]}}.

    'choices' lists the successor states in first-seen order and 'probs'
    holds the matching relative frequencies (count / total for the state).
    '''
    counts = {}
    for source, target in edges:
        targets = counts.setdefault(source, {})
        targets[target] = targets.get(target, 0) + 1

    chain = {}
    for source, targets in counts.items():
        total = sum(targets.values())
        chain[source] = {
            'choices': list(targets),
            'probs': [count / total for count in targets.values()],
        }
    return chain


def can_generate(chain):
    '''Return True if START exists and has at least one non-STOP successor.'''
    start = chain.get('START')
    return start is not None and any(
        target != 'STOP' for target in start['choices']
    )


def generate_sentence(chain, words, rng):
    '''Random-walk chain from START to STOP and return the words joined by ' '.

    chain is the mapping produced by build_chain, words the mapping produced
    by build_word_index and rng a numpy random Generator.

    Raises ValueError when the chain cannot produce a non-empty walk; without
    this check the retry below would spin forever.
    '''
    if not can_generate(chain):
        raise ValueError('part-of-speech text contains no usable sentences')

    state = 'START'
    picked = []
    while True:
        following = rng.choice(chain[state]['choices'], p=chain[state]['probs'])

        if following == 'STOP':
            if state == 'START':
                # An empty sentence was drawn; draw again.
                continue
            return ' '.join(picked)

        # Prefer words seen at this exact position, then any word with the
        # same tag and dependency. If the word text offers neither, leave the
        # slot empty instead of aborting the whole run.
        candidates = words.get(following)
        if candidates is None:
            candidates = words.get(following.rsplit(',', 1)[0])
        if candidates is not None:
            picked.append(rng.choice(candidates))

        state = following


def read_text(path):
    '''Return the full contents of the text file at path.'''
    with open(path, mode='r') as f:
        return f.read()


def main(argv=None):
    '''Parse the command line, build the model and print the sentences.

    argv is the argument list to parse; None means sys.argv[1:].
    '''
    parser = argparse.ArgumentParser(description='Generate a novel using Markov chains.')
    parser.add_argument('word_file', help='file used for word selection')
    parser.add_argument('pos_file', help='file used to build part-of-speech Markov chain')
    parser.add_argument('-s', '--seed', type=int, help='seed for random number generator')
    parser.add_argument('-n', '--sentences', type=int, default=10,
                        help='number of sentences to generate (default: 10)')
    args = parser.parse_args(argv)

    nlp = spacy.load('en_core_web_sm')

    rng = np.random.default_rng(resolve_seed(args.seed))

    words = build_word_index(nlp(read_text(args.word_file)))
    chain = build_chain(collect_edges(nlp(read_text(args.pos_file))))

    if not can_generate(chain):
        parser.error(f'{args.pos_file}: no usable sentences found')

    sents = [generate_sentence(chain, words, rng) for _ in range(args.sentences)]

    # stdout is a text stream and already translates '\n' to the platform
    # line ending, so joining with os.linesep would emit '\r\r\n' on Windows.
    print('\n'.join(sents))


if __name__ == '__main__':
    main()
|