Kaynağa Gözat

use spacy for tokenization

txlyre 1 ay önce
ebeveyn
işleme
ff3a4dc78c
2 değiştirilmiş dosya ile 22 ekleme ve 11 silme
  1. markov.py (+20 −10)
  2. requirements.txt (+2 −1)

+ 20 - 10
markov.py

@@ -1,7 +1,9 @@
 import os.path
 import re
 import atexit
+import string
 
+import spacy
 import ujson
 import markovify
 
@@ -15,6 +17,8 @@ class Markov:
         self.corpus = []
         self.chain = None
 
+        self.nlp = spacy.load("xx_sent_ud_sm")
+
         self.load()
 
         atexit.register(self.save)
@@ -28,11 +32,14 @@ class Markov:
         if not words:
             return self.generate()
 
-        text = " ".join(words)
-        text = re.sub(r"(?:^| )?((\!\?\?)|(\!\?)|(\?\!\!)|(\?\?\!)|(\?\!)|(\?\.\.)|(\.{2,})|(\!{2,})|(\?{2,})|([.?!,:;\(\)\"'\$\+\-–—…]))(?: |$)", r"\1 ", text)
-        text = text.strip()
+        text = ""
+        for word in words:
+            if word in "-–—" or not all(c in string.punctuation for c in word):
+                text += " "
 
-        return text
+            text += word
+
+        return text.strip()
 
     def rebuild(self):
         self.chain = markovify.Chain(self.corpus, config.MARKOV_STATE_SIZE).compile()
@@ -45,15 +52,18 @@ class Markov:
             return
 
         if "\n" in text:
-          for line in text.split("\n"):
-            self.extend_corpus(line)
+            for line in text.split("\n"):
+                self.extend_corpus(line)
 
-          return
+            return
 
         text = re.sub(r"(@[A-Za-z0-9_]+,?)", "", text)
-        text = re.sub("https?:\\/\\/(?:www\\.)?[-a-zA-Z0-9@:%._\\+~#=]{1,256}\\.[a-zA-Z0-9()]{1,6}\\b(?:[-a-zA-Z0-9()@:%_\\+.~#?&\\/=]*)", "", text)
-        text = re.sub(r"((\!\?\?)|(\!\?)|(\?\!\!)|(\?\?\!)|(\?\!)|(\?\.\.)|(\.{2,})|(\!{2,})|(\?{2,})|[.?!,:;\(\)\"'\$\+\-–—…])", r" \1 ", text)
-        text = text.split(" ")
+        text = re.sub(
+            "https?:\\/\\/(?:www\\.)?[-a-zA-Z0-9@:%._\\+~#=]{1,256}\\.[a-zA-Z0-9()]{1,6}\\b(?:[-a-zA-Z0-9()@:%_\\+.~#?&\\/=]*)",
+            "",
+            text,
+        )
+        text = list(self.nlp(text))
         text = map(lambda word: word.strip(), text)
         text = filter(bool, text)
         text = list(text)

+ 2 - 1
requirements.txt

@@ -6,4 +6,5 @@ aiofiles
 telethon
 emoji
 markovify
-ujson
+ujson
+spacy