Generate a novel using Markov chains
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

136 lines
3.6 KiB

4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
  1. '''
  2. markov.py - Generate a novel using Markov chains
  3. Copyright (C) 2020 Blink The Things
  4. This program is free software: you can redistribute it and/or modify
  5. it under the terms of the GNU Affero General Public License as published by
  6. the Free Software Foundation, either version 3 of the License, or
  7. (at your option) any later version.
  8. This program is distributed in the hope that it will be useful,
  9. but WITHOUT ANY WARRANTY; without even the implied warranty of
  10. MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  11. GNU Affero General Public License for more details.
  12. You should have received a copy of the GNU Affero General Public License
  13. along with this program. If not, see <https://www.gnu.org/licenses/>.
  14. '''
  15. import argparse
  16. import numpy as np
  17. import os
  18. import spacy
  19. parser = argparse.ArgumentParser(description='Generate a novel using Markov chains.')
  20. parser.add_argument('word_file', help='file used for word selection')
  21. parser.add_argument('pos_file', help='file used to build part-of-speech Markov chain')
  22. parser.add_argument('-s', '--seed', type=int, help='seed for random number generator')
  23. args = parser.parse_args()
  24. nlp = spacy.load('en_core_web_sm')
  25. seed = args.seed or 12345
  26. rng = np.random.default_rng(seed)
  27. words_text = ''
  28. with open(args.word_file, mode='r') as f:
  29. words_text = f.read()
  30. words_doc = nlp(words_text)
  31. words = {}
  32. for sent in words_doc.sents:
  33. cnt = 0
  34. for token in sent:
  35. if token.pos_ in ('SPACE', 'PUNCT', 'X'):
  36. continue
  37. cnt += 1
  38. word = token.text
  39. state = f'{token.tag_},{token.dep_}'
  40. if state in words:
  41. words[state].append(word)
  42. else:
  43. words[state] = [word]
  44. state = f'{token.tag_},{token.dep_},{str(cnt)}'
  45. if state in words:
  46. words[state].append(word)
  47. else:
  48. words[state] = [word]
  49. pos_text = ''
  50. with open(args.pos_file, mode='r') as f:
  51. pos_text = f.read()
  52. pos_doc = nlp(pos_text)
  53. edges = []
  54. for sent in pos_doc.sents:
  55. curr_state = 'START'
  56. cnt = 0
  57. for token in sent:
  58. if token.pos_ in ('SPACE', 'PUNCT', 'X'):
  59. continue
  60. cnt += 1
  61. next_state = f'{token.tag_},{token.dep_},{str(cnt)}'
  62. edges.append((curr_state, next_state))
  63. curr_state = next_state
  64. edges.append((curr_state, 'STOP'))
  65. transitions = {}
  66. for edge in edges:
  67. if edge[0] in transitions:
  68. transitions[edge[0]]['cnt'] += 1
  69. if edge[1] in transitions[edge[0]]['to']:
  70. transitions[edge[0]]['to'][edge[1]] += 1
  71. else:
  72. transitions[edge[0]]['to'][edge[1]] = 1
  73. else:
  74. transitions[edge[0]] = { 'cnt': 1, 'to': {edge[1]: 1}}
  75. chain = {}
  76. for key in transitions.keys():
  77. cnt = transitions[key]['cnt']
  78. choices = list(transitions[key]['to'])
  79. probs = []
  80. for choice in choices:
  81. probs.append(transitions[key]['to'][choice] / cnt)
  82. chain[key] = { 'choices': choices, 'probs': probs}
  83. sents = []
  84. for _ in range(10):
  85. choice = 'START'
  86. choices = []
  87. while True:
  88. next_choice = rng.choice(chain[choice]['choices'], p=chain[choice]['probs'])
  89. if choice == 'START' and next_choice == 'STOP':
  90. continue
  91. if next_choice == 'STOP':
  92. sents.append(' '.join(choices))
  93. break
  94. try:
  95. word = rng.choice(words[next_choice])
  96. except KeyError:
  97. word = rng.choice(words[','.join(next_choice.split(',')[:-1])])
  98. choices.append(word)
  99. choice = next_choice
  100. print(os.linesep.join(sents))