# txlyre/openKriemy
							import os.path
import re
import atexit
import string
import threading

import spacy
import ujson
import markovify

from config import config


class Markov:
    """Markov-chain text generator fed by a bounded corpus of token lists.

    The corpus is a list of token lists (newest first).  A compiled
    ``markovify.Chain`` is built from it on demand and used by
    :meth:`generate`.  Both the corpus and the chain are loaded from the
    paths named in ``config`` on construction and written back at
    interpreter exit.
    """

    # Maximum number of walks tried by generate() before it stops retrying
    # on empty results.
    MAX_WALK_ATTEMPTS = 32

    # "@name" mentions, with an optional trailing comma.
    _MENTION_RE = re.compile(r"(@[A-Za-z0-9_]+,?)")
    # http:// and https:// URLs.
    _URL_RE = re.compile(
        r"https?:\/\/(?:www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b(?:[-a-zA-Z0-9()@:%_\+.~#?&\/=]*)"
    )

    def __init__(self):
        """Load the tokenizer and any persisted state, and schedule a save at exit."""
        # Number of corpus updates seen since the last rebuild request.
        self.counter = 0

        # List of token lists, newest first.
        self.corpus = []
        # Compiled chain; stays None until loaded from disk or rebuilt.
        self.chain = None

        self.nlp = spacy.load("xx_sent_ud_sm")

        self.load()

        atexit.register(self.save)

    @property
    def is_ready(self):
        """Whether a chain is available, i.e. :meth:`generate` can be called."""
        return self.chain is not None

    def generate(self, init_state=None):
        """Generate text, optionally continuing from a seed string.

        Args:
            init_state: Optional seed text.  It is tokenized and reduced to
                exactly ``config.MARKOV_STATE_SIZE`` tokens: shorter seeds
                are left-padded with ``markovify.chain.BEGIN``, longer seeds
                keep only their last tokens.  ``None`` lets the chain pick
                its own starting state.

        Returns:
            The seed text (if any) followed by the generated words, stripped
            of surrounding whitespace.  If every one of
            ``MAX_WALK_ATTEMPTS`` walks comes back empty, only the stripped
            seed text (or an empty string) is returned.

        Note:
            ``self.chain`` must be set; check :attr:`is_ready` first.
            NOTE(review): a seed state the chain has never seen is
            presumably rejected by markovify -- confirm and handle in the
            caller.
        """
        seed_text = init_state
        state = None

        if seed_text is not None:
            state_size = config.MARKOV_STATE_SIZE
            state = tuple(self.tokenize(seed_text))

            if len(state) < state_size:
                padding = (markovify.chain.BEGIN,) * (state_size - len(state))
                state = padding + state
            elif len(state) > state_size:
                # Keep the LAST state_size tokens so the walk continues from
                # the end of the seed (a [:-n] slice would keep the wrong
                # end and usually yield a state of the wrong length).
                state = state[-state_size:]

        # Retry empty walks with the already-prepared state.  The number of
        # attempts is bounded so a state whose only continuation is the end
        # of a sentence cannot recurse or loop forever.
        words = []
        for _ in range(self.MAX_WALK_ATTEMPTS):
            words = self.chain.walk(state)
            if words:
                break

        parts = [seed_text] if seed_text is not None else []
        for word in words:
            # Dashes and ordinary words are preceded by a space; tokens made
            # up purely of punctuation (or "…") attach to the previous one.
            if word in "-–—" or not all(
                c in string.punctuation or c == "…" for c in word
            ):
                parts.append(" ")

            parts.append(word)

        return "".join(parts).strip()

    def _rebuild(self):
        """Build and compile a fresh chain from the current corpus."""
        self.chain = markovify.Chain(self.corpus, config.MARKOV_STATE_SIZE).compile()

    def rebuild(self):
        """Reset the update counter and rebuild the chain on a new thread."""
        self.counter = 0

        t = threading.Thread(target=self._rebuild)
        t.start()

    def tokenize(self, text):
        """Split *text* into a list of non-empty token strings.

        ``@mentions`` (with an optional trailing comma) and http(s) URLs are
        removed before the text is passed to ``self.nlp``; each resulting
        token is converted to ``str``, stripped, and dropped if empty.
        """
        text = self._MENTION_RE.sub("", text)
        text = self._URL_RE.sub("", text)

        tokens = (str(token).strip() for token in self.nlp(text))
        return [token for token in tokens if token]

    def extend_corpus(self, text):
        """Add every non-blank line of *text* to the corpus.

        Each line is handled independently: it is tokenized, inserted at the
        front of the corpus unless an identical token list is already
        present, the corpus is trimmed to ``config.MARKOV_CORPUS_SIZE``
        (when positive), and a rebuild is started every
        ``config.MARKOV_REBUILD_RATE`` processed lines (when positive).
        Lines that tokenize to nothing are ignored.
        """
        for line in text.split("\n"):
            line = line.strip()
            if line:
                self._add_line(line)

    def _add_line(self, line):
        """Tokenize one non-blank line and merge it into the corpus."""
        tokens = self.tokenize(line)
        if not tokens:
            # Nothing left after removing mentions/URLs.  An empty run would
            # only teach the chain to end a sentence immediately.
            return

        if tokens not in self.corpus:
            self.corpus.insert(0, tokens)

        if (
            config.MARKOV_CORPUS_SIZE > 0
            and len(self.corpus) > config.MARKOV_CORPUS_SIZE
        ):
            # Newest entries sit at the front, so this drops the oldest.
            self.corpus = self.corpus[: config.MARKOV_CORPUS_SIZE]

        self.counter += 1

        if (
            config.MARKOV_REBUILD_RATE > 0
            and self.counter % config.MARKOV_REBUILD_RATE == 0
        ):
            self.rebuild()

    def load(self):
        """Load the chain and the corpus from disk, if their files exist."""
        if os.path.isfile(config.MARKOV_CHAIN_PATH):
            with open(config.MARKOV_CHAIN_PATH, "r", encoding="utf-8") as f:
                self.chain = markovify.Chain.from_json(f.read())

        if os.path.isfile(config.MARKOV_CORPUS_PATH):
            with open(config.MARKOV_CORPUS_PATH, "r", encoding="utf-8") as f:
                self.corpus = ujson.load(f)

    def save(self):
        """Write the chain (when one exists) and the corpus to disk."""
        if self.chain:
            with open(config.MARKOV_CHAIN_PATH, "w", encoding="utf-8") as f:
                f.write(self.chain.to_json())

        with open(config.MARKOV_CORPUS_PATH, "w", encoding="utf-8") as f:
            ujson.dump(self.corpus, f)