|  | @@ -1,7 +1,9 @@
 | 
	
		
			
				|  |  |  import os.path
 | 
	
		
			
				|  |  |  import re
 | 
	
		
			
				|  |  |  import atexit
 | 
	
		
			
				|  |  | +import string
 | 
	
		
			
				|  |  |  
 | 
	
		
			
				|  |  | +import spacy
 | 
	
		
			
				|  |  |  import ujson
 | 
	
		
			
				|  |  |  import markovify
 | 
	
		
			
				|  |  |  
 | 
	
	
		
			
				|  | @@ -15,6 +17,8 @@ class Markov:
 | 
	
		
			
				|  |  |          self.corpus = []
 | 
	
		
			
				|  |  |          self.chain = None
 | 
	
		
			
				|  |  |  
 | 
	
		
			
				|  |  | +        self.nlp = spacy.load("xx_sent_ud_sm")
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  |          self.load()
 | 
	
		
			
				|  |  |  
 | 
	
		
			
				|  |  |          atexit.register(self.save)
 | 
	
	
		
			
				|  | @@ -28,11 +32,14 @@ class Markov:
 | 
	
		
			
				|  |  |          if not words:
 | 
	
		
			
				|  |  |              return self.generate()
 | 
	
		
			
				|  |  |  
 | 
	
		
			
				|  |  | -        text = " ".join(words)
 | 
	
		
			
				|  |  | -        text = re.sub(r"(?:^| )?((\!\?\?)|(\!\?)|(\?\!\!)|(\?\?\!)|(\?\!)|(\?\.\.)|(\.{2,})|(\!{2,})|(\?{2,})|([.?!,:;\(\)\"'\$\+\-–—…]))(?: |$)", r"\1 ", text)
 | 
	
		
			
				|  |  | -        text = text.strip()
 | 
	
		
			
				|  |  | +        text = ""
 | 
	
		
			
				|  |  | +        for word in words:
 | 
	
		
			
				|  |  | +            if word in "-–—" or not all(c in string.punctuation for c in word):
 | 
	
		
			
				|  |  | +                text += " "
 | 
	
		
			
				|  |  |  
 | 
	
		
			
				|  |  | -        return text
 | 
	
		
			
				|  |  | +            text += word
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +        return text.strip()
 | 
	
		
			
				|  |  |  
 | 
	
		
			
				|  |  |      def rebuild(self):
 | 
	
		
			
				|  |  |          self.chain = markovify.Chain(self.corpus, config.MARKOV_STATE_SIZE).compile()
 | 
	
	
		
			
				|  | @@ -45,15 +52,18 @@ class Markov:
 | 
	
		
			
				|  |  |              return
 | 
	
		
			
				|  |  |  
 | 
	
		
			
				|  |  |          if "\n" in text:
 | 
	
		
			
				|  |  | -          for line in text.split("\n"):
 | 
	
		
			
				|  |  | -            self.extend_corpus(line)
 | 
	
		
			
				|  |  | +            for line in text.split("\n"):
 | 
	
		
			
				|  |  | +                self.extend_corpus(line)
 | 
	
		
			
				|  |  |  
 | 
	
		
			
				|  |  | -          return
 | 
	
		
			
				|  |  | +            return
 | 
	
		
			
				|  |  |  
 | 
	
		
			
				|  |  |          text = re.sub(r"(@[A-Za-z0-9_]+,?)", "", text)
 | 
	
		
			
				|  |  | -        text = re.sub("https?:\\/\\/(?:www\\.)?[-a-zA-Z0-9@:%._\\+~#=]{1,256}\\.[a-zA-Z0-9()]{1,6}\\b(?:[-a-zA-Z0-9()@:%_\\+.~#?&\\/=]*)", "", text)
 | 
	
		
			
				|  |  | -        text = re.sub(r"((\!\?\?)|(\!\?)|(\?\!\!)|(\?\?\!)|(\?\!)|(\?\.\.)|(\.{2,})|(\!{2,})|(\?{2,})|[.?!,:;\(\)\"'\$\+\-–—…])", r" \1 ", text)
 | 
	
		
			
				|  |  | -        text = text.split(" ")
 | 
	
		
			
				|  |  | +        text = re.sub(
 | 
	
		
			
				|  |  | +            "https?:\\/\\/(?:www\\.)?[-a-zA-Z0-9@:%._\\+~#=]{1,256}\\.[a-zA-Z0-9()]{1,6}\\b(?:[-a-zA-Z0-9()@:%_\\+.~#?&\\/=]*)",
 | 
	
		
			
				|  |  | +            "",
 | 
	
		
			
				|  |  | +            text,
 | 
	
		
			
				|  |  | +        )
 | 
	
		
			
				|  |  | +        text = list(self.nlp(text))
 | 
	
		
			
				|  |  |          text = map(lambda word: word.strip(), text)
 | 
	
		
			
				|  |  |          text = filter(bool, text)
 | 
	
		
			
				|  |  |          text = list(text)
 |