Extending mySpellChecker: Plugin Architecture
mySpellChecker’s plugin architecture lets you replace or extend every major component — segmenters, dictionary providers, validation strategies, and POS taggers — by implementing the corresponding base class and passing your instance to SpellChecker.

Custom Segmenter

If you have a proprietary segmentation model or want to use a different library:
  1. Subclass Segmenter.
  2. Implement segment_syllables and segment_words.
  3. Pass it to SpellChecker.
from myspellchecker.segmenters import Segmenter
from typing import List

class SpaceSegmenter(Segmenter):
    """Segmenter that treats a single space as the word boundary.

    Syllables are approximated as individual characters; words are the
    space-separated tokens of the input text.
    """

    def segment_syllables(self, text: str) -> List[str]:
        # Character-level split stands in for real syllable segmentation.
        return [ch for ch in text]

    def segment_words(self, text: str) -> List[str]:
        # Split on a literal space (consecutive spaces yield empty tokens,
        # matching str.split(" ") semantics).
        words = text.split(" ")
        return words

# Plug the custom segmenter into the checker (SpellChecker is imported elsewhere).
checker = SpellChecker(segmenter=SpaceSegmenter())

BiLSTM Segmenter Example

from myspellchecker.segmenters import Segmenter
import torch

class BiLSTMSegmenter(Segmenter):
    """Segmenter backed by a trained BiLSTM model.

    The file at ``model_path`` must be loadable with ``torch.load``.  The
    decoding helpers (``_decode_predictions``, ``_group_into_words``) are
    model-specific and must be supplied by the implementer.
    """

    def __init__(self, model_path: str):
        self.model = torch.load(model_path)
        # Inference-only usage: switch off dropout/batch-norm training
        # behavior, which torch.load does NOT do automatically.
        self.model.eval()

    def segment_syllables(self, text: str) -> List[str]:
        """Return the syllables of *text* predicted by the model."""
        with torch.no_grad():  # no gradient bookkeeping needed at inference
            # Your model inference logic
            predictions = self.model(text)
            return self._decode_predictions(predictions)

    def segment_words(self, text: str) -> List[str]:
        """Group model-predicted syllables into words."""
        syllables = self.segment_syllables(text)
        # Grouping is model-specific (e.g. decoding BIO-style tags).
        return self._group_into_words(syllables)

Custom Dictionary Provider

To load data from a Redis cache, API, or other source:
  1. Subclass DictionaryProvider.
  2. Implement required abstract methods.

Redis Provider Example

from myspellchecker.providers import DictionaryProvider
from typing import List, Optional, Tuple
import redis

class RedisProvider(DictionaryProvider):
    """Dictionary provider backed by a Redis cache.

    Key layout (all under ``prefix``, default ``spell:``):
      - ``word:<word>``            presence marks a valid word
      - ``syl:<syllable>``         presence marks a valid syllable
      - ``freq:<word>``            word frequency (int)
      - ``bigram:<w1>:<w2>``       bigram probability (float)
      - ``trigram:<w1>:<w2>:<w3>`` trigram probability (float)
    """

    def __init__(self, redis_client, prefix: str = "spell:"):
        self.redis = redis_client
        self.prefix = prefix

    def is_valid_word(self, word: str) -> bool:
        # EXISTS returns an int count; coerce to the declared bool.
        return bool(self.redis.exists(f"{self.prefix}word:{word}"))

    def is_valid_syllable(self, syllable: str) -> bool:
        return bool(self.redis.exists(f"{self.prefix}syl:{syllable}"))

    def get_word_frequency(self, word: str) -> int:
        freq = self.redis.get(f"{self.prefix}freq:{word}")
        return int(freq) if freq else 0

    def get_bigram_probability(self, word1: str, word2: str) -> float:
        prob = self.redis.get(f"{self.prefix}bigram:{word1}:{word2}")
        return float(prob) if prob else 0.0

    def get_trigram_probability(self, word1: str, word2: str, word3: str) -> float:
        prob = self.redis.get(f"{self.prefix}trigram:{word1}:{word2}:{word3}")
        return float(prob) if prob else 0.0

    def get_top_continuations(self, prev_word: str, limit: int = 20) -> List[Tuple[str, float]]:
        """Return up to *limit* ``(word, frequency)`` pairs for words starting
        with *prev_word*, sorted by descending frequency.

        Uses incremental SCAN with a MATCH pattern so the server is never
        blocked by a full keyspace walk.
        """
        key_ns = f"{self.prefix}word:"
        # BUG FIX: the original MATCH pattern referenced an undefined name
        # `prefix`; the intended prefix is the `prev_word` argument.
        pattern = f"{key_ns}{prev_word}*"
        cursor = 0
        results: List[Tuple[str, float]] = []
        while len(results) < limit:
            cursor, keys = self.redis.scan(cursor, pattern, count=100)
            for key in keys:
                word = key.decode().replace(key_ns, "")
                # Cast to float to honor the declared return type.
                results.append((word, float(self.get_word_frequency(word))))
            if cursor == 0:  # SCAN cursor 0 means the iteration is complete
                break
        return sorted(results, key=lambda x: -x[1])[:limit]

    def close(self) -> None:
        """Release the underlying Redis connection."""
        self.redis.close()

# Usage: connect to a local Redis instance and wire the provider into the checker.
redis_client = redis.Redis(host='localhost', port=6379, db=0)
provider = RedisProvider(redis_client)
checker = SpellChecker(provider=provider)

REST API Provider Example

import requests
from myspellchecker.providers import DictionaryProvider

class APIProvider(DictionaryProvider):
    """Dictionary provider using a REST API backend.

    Expected endpoints:
      - ``GET /words/<word>``      -> 200 when the word exists
      - ``GET /frequency/<word>``  -> JSON ``{"frequency": <int>}``
    """

    # Fail fast instead of hanging forever: requests has NO default timeout.
    REQUEST_TIMEOUT = 5.0

    def __init__(self, base_url: str, api_key: Optional[str] = None):
        self.base_url = base_url.rstrip("/")
        self.session = requests.Session()  # reuse connections across calls
        if api_key:
            self.session.headers["Authorization"] = f"Bearer {api_key}"

    def is_valid_word(self, word: str) -> bool:
        response = self.session.get(
            f"{self.base_url}/words/{word}", timeout=self.REQUEST_TIMEOUT
        )
        return response.status_code == 200

    def get_word_frequency(self, word: str) -> int:
        response = self.session.get(
            f"{self.base_url}/frequency/{word}", timeout=self.REQUEST_TIMEOUT
        )
        if response.status_code == 200:
            return response.json().get("frequency", 0)
        return 0

    # ... implement other methods similarly

Custom Validation Strategies

The ContextValidator uses a strategy pattern for extensible validation.

Creating a Custom Strategy

from myspellchecker.core.validation_strategies import ValidationStrategy, ValidationContext
from myspellchecker.core.response import Error
from typing import List

class ProfanityFilterStrategy(ValidationStrategy):
    """Strategy that flags words found on a configured block list.

    Matching is case-insensitive: both the block list and the words being
    validated are lowercased before comparison.
    """

    def __init__(self, blocked_words: List[str]):
        # BUG FIX: normalize the block list to lowercase so mixed-case
        # entries (e.g. "Bad_Word") still match the lowercased lookup
        # performed in validate().
        self.blocked_words = {w.lower() for w in blocked_words}

    def priority(self) -> int:
        """Lower values run first. Default strategies use 10-70."""
        return 25  # Run after tone validation (10) but before POS (30)

    def validate(self, context: ValidationContext) -> List[Error]:
        """Return one Error (suggesting redaction) per blocked word found."""
        errors = []
        for i, word in enumerate(context.words):
            if word.lower() in self.blocked_words:
                errors.append(Error(
                    text=word,
                    position=context.word_positions[i],
                    suggestions=["[redacted]"],
                    error_type="profanity",
                    confidence=1.0,
                ))
        return errors

# Register the strategy with a ContextValidator.
from myspellchecker.core.context_validator import ContextValidator

strategies = [
    ProfanityFilterStrategy(["bad_word1", "bad_word2"]),
    # ... other strategies
]
# NOTE: `config` and `segmenter` are assumed to exist (see earlier examples).
validator = ContextValidator(config, segmenter, strategies=strategies)

Strategy Priority Guidelines

| Priority | Category    | Description                               |
|----------|-------------|-------------------------------------------|
| 10       | Tone        | Tone mark disambiguation                  |
| 15       | Orthography | Medial order and compatibility validation |
| 20       | Syntactic   | Grammar rule validation                   |
| 25       | Custom      | Custom validation strategies              |
| 30       | POS         | Part-of-speech sequence validation        |
| 40       | Question    | Question particle validation              |
| 45       | Homophone   | Homophone confusion detection             |
| 50       | N-gram      | Statistical context validation            |
| 70       | Semantic    | AI-powered semantic validation            |

Factory Pattern Usage

mySpellChecker uses factories for creating configured components.

Using Component Factory

from myspellchecker.core.component_factory import ComponentFactory
from myspellchecker.core.config import SpellCheckerConfig

config = SpellCheckerConfig(
    max_edit_distance=2,
    use_phonetic=True,
    use_context_checker=True,
)

factory = ComponentFactory(config)

# Create individual components from the shared config.
symspell = factory.create_symspell(provider)  # requires a dictionary provider (see above)
components = factory.create_all(provider, segmenter)  # builds the full component set

POS Tagger Factory

from myspellchecker.algorithms.pos_tagger_factory import POSTaggerFactory
from myspellchecker.core.config import POSTaggerConfig

# Create rule-based tagger (fast, no dependencies)
tagger = POSTaggerFactory.create("rule_based")

# Create Viterbi tagger (better accuracy; needs a provider for statistics)
tagger = POSTaggerFactory.create("viterbi", provider=provider)

# Create transformer tagger (best accuracy, requires torch)
tagger = POSTaggerFactory.create(
    "transformer",
    model_name="chuuhtetnaing/myanmar-pos-model",
    device=0  # GPU device index — presumably -1 selects CPU; confirm against factory docs
)

Advanced Configuration Patterns

Environment-Based Configuration

from myspellchecker.core.config.loader import ConfigLoader

loader = ConfigLoader()

# Load from a named profile, letting environment variables override fields.
config = loader.load(
    profile="production",
    use_env=True,  # Read MYSPELL_* environment variables
)

# Available environment variables (override the matching config fields):
# MYSPELL_MAX_EDIT_DISTANCE=3
# MYSPELL_MAX_SUGGESTIONS=10
# MYSPELL_USE_CONTEXT_CHECKER=true
# MYSPELL_DATABASE_PATH=/path/to/custom.db

Profile-Based Configuration

from myspellchecker.core.config.loader import load_config

# Fast profile - optimized for speed
config = load_config(profile="fast")

# Accurate profile - maximum accuracy
config = load_config(profile="accurate")

# Production profile - balanced speed/accuracy trade-off
config = load_config(profile="production")

Programmatic Configuration

from myspellchecker.core.config import (
    SpellCheckerConfig,
    SymSpellConfig,
    NgramContextConfig,
    POSTaggerConfig,
)

# Build the full configuration in code; nested configs mirror the YAML layout.
config = SpellCheckerConfig(
    max_edit_distance=2,
    max_suggestions=10,
    use_phonetic=True,
    use_context_checker=True,
    symspell=SymSpellConfig(
        prefix_length=10,
        beam_width=150,
    ),
    ngram_context=NgramContextConfig(
        # Probabilities below these thresholds are flagged as context errors
        # — presumably; confirm semantics against NgramContextConfig docs.
        bigram_threshold=0.0005,
        trigram_threshold=0.00005,
    ),
    pos_tagger=POSTaggerConfig(
        tagger_type="transformer",
        model_name="chuuhtetnaing/myanmar-pos-model",
    ),
)

checker = SpellChecker(config=config)

Configuration from File

# myspellchecker.yaml — file-based configuration.
# Top-level and nested keys mirror the programmatic SpellCheckerConfig fields.
preset: production  # start from the "production" profile defaults
max_edit_distance: 2
max_suggestions: 10

symspell:
  prefix_length: 10
  beam_width: 150

ngram_context:
  bigram_threshold: 0.0005
  trigram_threshold: 0.00005

pos_tagger:
  tagger_type: transformer
  model_name: chuuhtetnaing/myanmar-pos-model
from myspellchecker.core.config.loader import load_config

# Load the YAML file shown above into a config object.
config = load_config(config_file="myspellchecker.yaml")