markov.py 2.3 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697
  1. import os.path
  2. import re
  3. import atexit
  4. import string
  5. import spacy
  6. import ujson
  7. import markovify
  8. from config import config
  9. class Markov:
  10. def __init__(self):
  11. self.counter = 0
  12. self.corpus = []
  13. self.chain = None
  14. self.nlp = spacy.load("xx_sent_ud_sm")
  15. self.load()
  16. atexit.register(self.save)
  17. @property
  18. def is_ready(self):
  19. return self.chain is not None
  20. def generate(self):
  21. words = self.chain.walk()
  22. if not words:
  23. return self.generate()
  24. text = ""
  25. for word in words:
  26. if word in "-–—" or not all(c in string.punctuation for c in word):
  27. text += " "
  28. text += word
  29. return text.strip()
  30. def rebuild(self):
  31. self.chain = markovify.Chain(self.corpus, config.MARKOV_STATE_SIZE).compile()
  32. self.counter = 0
  33. def extend_corpus(self, text):
  34. text = text.strip()
  35. if not text:
  36. return
  37. if "\n" in text:
  38. for line in text.split("\n"):
  39. self.extend_corpus(line)
  40. return
  41. text = re.sub(r"(@[A-Za-z0-9_]+,?)", "", text)
  42. text = re.sub(
  43. "https?:\\/\\/(?:www\\.)?[-a-zA-Z0-9@:%._\\+~#=]{1,256}\\.[a-zA-Z0-9()]{1,6}\\b(?:[-a-zA-Z0-9()@:%_\\+.~#?&\\/=]*)",
  44. "",
  45. text,
  46. )
  47. text = list(self.nlp(text))
  48. text = map(lambda word: word.strip(), text)
  49. text = filter(bool, text)
  50. text = list(text)
  51. if text not in self.corpus:
  52. self.corpus.insert(0, text)
  53. if len(self.corpus) > config.MARKOV_CORPUS_SIZE:
  54. self.corpus.pop(-1)
  55. self.counter += 1
  56. if self.counter % config.MARKOV_REBUILD_RATE == 0:
  57. self.rebuild()
  58. def load(self):
  59. if os.path.isfile(config.MARKOV_CHAIN_PATH):
  60. with open(config.MARKOV_CHAIN_PATH, "r") as f:
  61. self.chain = markovify.Chain.from_json(f.read())
  62. if os.path.isfile(config.MARKOV_CORPUS_PATH):
  63. with open(config.MARKOV_CORPUS_PATH, "r") as f:
  64. self.corpus = ujson.load(f)
  65. def save(self):
  66. if self.chain:
  67. with open(config.MARKOV_CHAIN_PATH, "w") as f:
  68. f.write(self.chain.to_json())
  69. with open(config.MARKOV_CORPUS_PATH, "w") as f:
  70. ujson.dump(self.corpus, f)