@@ -1,7 +1,9 @@
 import os.path
 import re
 import atexit
+import string
 
+import spacy
 import ujson
 import markovify
 
@@ -15,6 +17,8 @@ class Markov:
         self.corpus = []
         self.chain = None
 
+        self.nlp = spacy.load("xx_sent_ud_sm")
+
         self.load()
 
         atexit.register(self.save)
@@ -28,11 +32,14 @@ class Markov:
         if not words:
             return self.generate()
 
-        text = " ".join(words)
-        text = re.sub(r"(?:^| )?((\!\?\?)|(\!\?)|(\?\!\!)|(\?\?\!)|(\?\!)|(\?\.\.)|(\.{2,})|(\!{2,})|(\?{2,})|([.?!,:;\(\)\"'\$\+\-–—…]))(?: |$)", r"\1 ", text)
-        text = text.strip()
+        text = ""
+        for word in words:
+            if word in "-–—" or not all(c in string.punctuation for c in word):
+                text += " "
 
-        return text
+            text += word
+
+        return text.strip()
 
     def rebuild(self):
         self.chain = markovify.Chain(self.corpus, config.MARKOV_STATE_SIZE).compile()
@@ -45,15 +52,18 @@ class Markov:
             return
 
         if "\n" in text:
-            for line in text.split("\n"):
-                self.extend_corpus(line)
+            for line in text.split("\n"):
+                self.extend_corpus(line)
 
-            return
+            return
 
         text = re.sub(r"(@[A-Za-z0-9_]+,?)", "", text)
-        text = re.sub("https?:\\/\\/(?:www\\.)?[-a-zA-Z0-9@:%._\\+~#=]{1,256}\\.[a-zA-Z0-9()]{1,6}\\b(?:[-a-zA-Z0-9()@:%_\\+.~#?&\\/=]*)", "", text)
-        text = re.sub(r"((\!\?\?)|(\!\?)|(\?\!\!)|(\?\?\!)|(\?\!)|(\?\.\.)|(\.{2,})|(\!{2,})|(\?{2,})|[.?!,:;\(\)\"'\$\+\-–—…])", r" \1 ", text)
-        text = text.split(" ")
+        text = re.sub(
+            "https?:\\/\\/(?:www\\.)?[-a-zA-Z0-9@:%._\\+~#=]{1,256}\\.[a-zA-Z0-9()]{1,6}\\b(?:[-a-zA-Z0-9()@:%_\\+.~#?&\\/=]*)",
+            "",
+            text,
+        )
+        text = [token.text for token in self.nlp(text)]
         text = map(lambda word: word.strip(), text)
         text = filter(bool, text)
         text = list(text)
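
For reference, a minimal sketch (not part of the patch) of how the new spaCy tokenization and the punctuation-aware re-join behave together. It assumes the xx_sent_ud_sm pipeline is installed (e.g. via python -m spacy download xx_sent_ud_sm); the sample sentence and variable names are illustrative only.

import string

import spacy

nlp = spacy.load("xx_sent_ud_sm")

# Tokenize roughly the way extend_corpus() now does: spaCy splits
# punctuation into separate tokens instead of the old regex splitting.
words = [token.text for token in nlp("Hello, world. Does this work?")]
# e.g. ['Hello', ',', 'world', '.', 'Does', 'this', 'work', '?']

# Re-join the way generate() now does: punctuation-only tokens attach
# to the preceding word, while dashes and ordinary words get a space.
text = ""
for word in words:
    if word in "-–—" or not all(c in string.punctuation for c in word):
        text += " "
    text += word

print(text.strip())  # e.g. "Hello, world. Does this work?"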