@@ -1,7 +1,9 @@
 import os.path
 import re
 import atexit
+import string
 
+import spacy
 import ujson
 import markovify
 
@@ -15,6 +17,8 @@ class Markov:
         self.corpus = []
         self.chain = None
 
+        self.nlp = spacy.load("xx_sent_ud_sm")
+
         self.load()
 
         atexit.register(self.save)
@@ -28,11 +32,14 @@ class Markov:
         if not words:
             return self.generate()
 
-        text = " ".join(words)
-        text = re.sub(r"(?:^| )?((\!\?\?)|(\!\?)|(\?\!\!)|(\?\?\!)|(\?\!)|(\?\.\.)|(\.{2,})|(\!{2,})|(\?{2,})|([.?!,:;\(\)\"'\$\+\-–—…]))(?: |$)", r"\1 ", text)
-        text = text.strip()
+        text = ""
+        for word in words:
+            if word in "-–—" or not all(c in string.punctuation for c in word):
+                text += " "
 
-        return text
+            text += word
+
+        return text.strip()
 
     def rebuild(self):
         self.chain = markovify.Chain(self.corpus, config.MARKOV_STATE_SIZE).compile()
@@ -45,15 +52,18 @@ class Markov:
             return
 
         if "\n" in text:
-            for line in text.split("\n"):
-                self.extend_corpus(line)
+            for line in text.split("\n"):
+                self.extend_corpus(line)
 
-            return
+            return
 
         text = re.sub(r"(@[A-Za-z0-9_]+,?)", "", text)
-        text = re.sub("https?:\\/\\/(?:www\\.)?[-a-zA-Z0-9@:%._\\+~#=]{1,256}\\.[a-zA-Z0-9()]{1,6}\\b(?:[-a-zA-Z0-9()@:%_\\+.~#?&\\/=]*)", "", text)
-        text = re.sub(r"((\!\?\?)|(\!\?)|(\?\!\!)|(\?\?\!)|(\?\!)|(\?\.\.)|(\.{2,})|(\!{2,})|(\?{2,})|[.?!,:;\(\)\"'\$\+\-–—…])", r" \1 ", text)
-        text = text.split(" ")
+        text = re.sub(
+            "https?:\\/\\/(?:www\\.)?[-a-zA-Z0-9@:%._\\+~#=]{1,256}\\.[a-zA-Z0-9()]{1,6}\\b(?:[-a-zA-Z0-9()@:%_\\+.~#?&\\/=]*)",
+            "",
+            text,
+        )
+        text = [token.text for token in self.nlp(text)]
         text = map(lambda word: word.strip(), text)
         text = filter(bool, text)
         text = list(text)
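
For reference, a minimal sketch (not part of the patch) of how the new spaCy tokenization and the punctuation-aware re-join behave together. It assumes the xx_sent_ud_sm pipeline is installed (e.g. via python -m spacy download xx_sent_ud_sm); the sample sentence and variable names are illustrative only.

import string

import spacy

nlp = spacy.load("xx_sent_ud_sm")

# Tokenize roughly the way extend_corpus() now does: spaCy splits
# punctuation into separate tokens instead of the old regex splitting.
words = [token.text for token in nlp("Hello, world. Does this work?")]
# e.g. ['Hello', ',', 'world', '.', 'Does', 'this', 'work', '?']

# Re-join the way generate() now does: punctuation-only tokens attach
# to the preceding word, while dashes and ordinary words get a space.
text = ""
for word in words:
    if word in "-–—" or not all(c in string.punctuation for c in word):
        text += " "
    text += word

print(text.strip())  # e.g. "Hello, world. Does this work?"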