Generate a novel using Markov chains
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

157 lines
4.5 KiB

4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
'''
markov.py - Generate a novel using Markov chains
Copyright (C) 2020 Blink The Things

This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.

You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <https://www.gnu.org/licenses/>.
'''
import argparse
import os
import sys
from collections import Counter, defaultdict
from textwrap import fill

import numpy as np
import spacy
  21. parser = argparse.ArgumentParser(description='Generate a novel using Markov chains.')
  22. parser.add_argument('input', nargs='+', help='used to construct Markov transition matrix')
  23. parser.add_argument('-w','--words', type=int, help='generate at least WORDS words')
  24. parser.add_argument('-s', '--seed', type=int, help='seed for random number generator')
  25. args = parser.parse_args()
  26. nlp = spacy.load('en_core_web_sm')
  27. rng = np.random.default_rng(args.seed or 12345)
  28. word_cnt = args.words or 100
  29. words = {}
  30. edges = []
  31. input_text = ''
  32. for infile in args.input:
  33. with open(infile, mode='r') as f:
  34. input_text = f.read()
  35. i = 1000000
  36. if len(input_text) > i:
  37. while input_text[i] != ' ':
  38. i -= 1
  39. doc = nlp(input_text[:i])
  40. for sent in doc.sents:
  41. cnt = 0
  42. for token in sent:
  43. if token.pos_ in ('SPACE', 'PUNCT', 'X'):
  44. continue
  45. cnt += 1
  46. word = token.text
  47. state = f'{token.tag_},{token.dep_}'
  48. if state in words:
  49. words[state].append(word)
  50. else:
  51. words[state] = [word]
  52. state = f'{token.tag_},{token.dep_}'
  53. if state in words:
  54. words[state].append(word)
  55. else:
  56. words[state] = [word]
  57. for sent in doc.sents:
  58. curr_state = 'START'
  59. cnt = 0
  60. for token in sent:
  61. if token.pos_ in ('SPACE', 'PUNCT', 'X'):
  62. continue
  63. cnt += 1
  64. next_state = f'{token.tag_},{token.dep_}'
  65. edges.append((curr_state, next_state))
  66. curr_state = next_state
  67. edges.append((curr_state, 'STOP'))
  68. transitions = {}
  69. for edge in edges:
  70. if edge[0] in transitions:
  71. transitions[edge[0]]['cnt'] += 1
  72. if edge[1] in transitions[edge[0]]['to']:
  73. transitions[edge[0]]['to'][edge[1]] += 1
  74. else:
  75. transitions[edge[0]]['to'][edge[1]] = 1
  76. else:
  77. transitions[edge[0]] = { 'cnt': 1, 'to': {edge[1]: 1}}
  78. chain = {}
  79. for key in transitions.keys():
  80. cnt = transitions[key]['cnt']
  81. choices = list(transitions[key]['to'])
  82. probs = []
  83. for choice in choices:
  84. probs.append(transitions[key]['to'][choice] / cnt)
  85. chain[key] = { 'choices': choices, 'probs': probs}
  86. sents = []
  87. paragraphs = []
  88. paragraph_sent_cnt = rng.integers(5, 10)
  89. while word_cnt > 0:
  90. choice = 'START'
  91. choices = []
  92. while True:
  93. next_choice = rng.choice(chain[choice]['choices'], p=chain[choice]['probs'])
  94. if choice == 'START' and next_choice == 'STOP':
  95. continue
  96. if next_choice == 'STOP':
  97. sents.append(' '.join(choices)
  98. .replace(" '", "'")
  99. .replace("", "")
  100. .replace(" `", "`")
  101. + '. '
  102. )
  103. paragraph_sent_cnt -= 1
  104. if paragraph_sent_cnt < 0:
  105. paragraphs.append(fill(''.join(sents), replace_whitespace=False, drop_whitespace=False))
  106. sents = []
  107. paragraph_sent_cnt = rng.integers(5, 10)
  108. break
  109. try:
  110. word = rng.choice(words[next_choice])
  111. except KeyError:
  112. word = rng.choice(words[','.join(next_choice.split(',')[:-1])])
  113. if choice == 'START' or word == 'i':
  114. word = str.title(word)
  115. elif not (next_choice.startswith('PROPN') or word == 'I'):
  116. word = str.lower(word)
  117. choices.append(word)
  118. word_cnt -= 1
  119. choice = next_choice
  120. print(f'{os.linesep}{os.linesep}'.join(paragraphs))