AlgorithmFactory API Reference
AlgorithmFactory creates configured algorithm instances (SymSpell, N-gram checker, phonetic matcher) with built-in LRU caching for 10-100x speedup on repeated lookups. It’s used internally by SpellChecker but can be accessed directly for advanced use cases.

Overview

from myspellchecker.algorithms.factory import AlgorithmFactory
from myspellchecker.providers import SQLiteProvider

# Create factory with provider
provider = SQLiteProvider("mydict.db")
factory = AlgorithmFactory(provider)

# Create algorithms with built-in caching
symspell = factory.create_symspell()
ngram_checker = factory.create_ngram_checker()

AlgorithmFactory Class

Central factory for all spell checking algorithms:
class AlgorithmFactory:
    """Factory for creating spell checking algorithms with caching.

    Provides centralized creation of algorithm instances with:
    - Transparent result caching (10-100x speedup)
    - Consistent configuration
    - Lazy initialization
    - Cache statistics

    Args:
        provider: Dictionary provider for data access
        enable_caching: Enable result caching (default: True)
        cache_sizes: Maximum cache entries per algorithm type (default: None)
        share_caches: Share caches across factory instances with same provider (default: True)
    """

    def __init__(
        self,
        provider: DictionaryProvider,
        enable_caching: bool = True,
        cache_sizes: Optional[Dict[str, int]] = None,
        share_caches: bool = True,
    ):
        self.provider = provider
        self.enable_caching = enable_caching
        self.cache_sizes = cache_sizes
        # Fix: share_caches was accepted (and documented above) but never
        # stored, so downstream code had no way to honor the setting.
        self.share_caches = share_caches

        # Lazy-initialized algorithm instances (built on first create_* call)
        self._symspell: Optional[SymSpell] = None
        self._ngram_checker: Optional[NgramContextChecker] = None
        self._semantic_checker: Optional[SemanticChecker] = None

        # Cache instances, keyed by data type (e.g. "dictionary_word", "bigram")
        self._caches: Dict[str, LRUCache] = {}

Factory Methods

create_symspell

Creates a SymSpell instance with dictionary lookup caching:
def create_symspell(
    self,
    config: Optional[SymSpellConfig] = None,
    max_edit_distance: int = 2,
    phonetic_hasher: Optional[Any] = None,
    build_index: bool = True,
) -> SymSpell:
    """Create a SymSpell instance with caching.

    Args:
        config: SymSpellConfig instance (uses defaults if None)
        max_edit_distance: Maximum edit distance for suggestions
        phonetic_hasher: Optional PhoneticHasher for phonetic matching
        build_index: Whether to build the index after creation (default: True)

    Returns:
        Configured SymSpell instance (cached if enable_caching=True)

    NOTE(review): implementation body is elided in this excerpt; presumably
    the instance is memoized on the factory (lazy initialization per the
    class docstring) — confirm against the full source.
    """
Usage:
factory = AlgorithmFactory(provider)

# Create with defaults
symspell = factory.create_symspell()

# Create with custom config
from myspellchecker.core.config import SymSpellConfig

symspell = factory.create_symspell(
    config=SymSpellConfig(prefix_length=5, beam_width=100),
    max_edit_distance=3,
)

# Get suggestions (results are cached)
suggestions = symspell.lookup("ကျောင့်")
suggestions = symspell.lookup("ကျောင့်")  # Cache hit!

create_ngram_checker

Creates an N-gram context checker:
def create_ngram_checker(
    self,
    config: Optional[NgramContextConfig] = None,
    symspell: Optional[SymSpell] = None,
) -> NgramContextChecker:
    """Create an N-gram context checker with caching.

    Args:
        config: N-gram context configuration (uses defaults if None)
        symspell: Optional SymSpell instance for integration

    Returns:
        Configured NgramContextChecker instance

    NOTE(review): body elided in this excerpt — presumably memoized on the
    factory like the other create_* methods; verify in the full source.
    """
Usage:
ngram_checker = factory.create_ngram_checker()

# Check context probability
score = ngram_checker.score_context(["မြန်မာ", "နိုင်ငံ", "သည်"])

# With custom config
from myspellchecker.core.config import NgramContextConfig

ngram_checker = factory.create_ngram_checker(
    config=NgramContextConfig(threshold=0.05),
)

create_semantic_checker

Creates an ONNX-based semantic checker:
def create_semantic_checker(
    self,
    config: Optional[SemanticConfig] = None,
) -> Optional[SemanticChecker]:
    """Create a semantic checker (ONNX-based).

    Args:
        config: Semantic checker configuration (uses defaults if None)

    Returns:
        SemanticChecker instance, or None if model not found

    Callers must handle the None return (missing model) before use.
    """
Usage:
from myspellchecker.core.config import SemanticConfig

semantic = factory.create_semantic_checker(
    config=SemanticConfig(
        model_path="models/semantic.onnx",
        tokenizer_path="models/tokenizer.json",
    ),
)

if semantic:
    result = semantic.check("မြန်မာ [MASK] သည်")

Cached Wrappers

CachedDictionaryLookup

Wraps dictionary lookups with LRU caching:
class CachedDictionaryLookup:
    """Cached wrapper for dictionary lookups.

    Caches syllable/word validation and frequency lookups.

    Args:
        provider: Underlying dictionary lookup to delegate to
        syllable_cache_size: Max cached syllable entries (default: 4096)
        word_cache_size: Max cached word entries (default: 8192)
        use_lock: Guard cache access with a lock (default: False)
    """

    def __init__(
        self,
        provider: DictionaryLookup,
        syllable_cache_size: int = 4096,
        word_cache_size: int = 8192,
        use_lock: bool = False,
    ):
        self._provider = provider
        # Creates instance-specific lru_cache methods for:
        # is_valid_syllable, is_valid_word,
        # get_syllable_frequency, get_word_frequency
        # NOTE(review): the cache-size and use_lock parameters are not
        # visibly wired up in this excerpt — confirm the elided body
        # actually applies them.

    def is_valid_syllable(self, syllable: str) -> bool:
        """Check if syllable exists in dictionary (cached)."""
        ...

    def is_valid_word(self, word: str) -> bool:
        """Check if word exists in dictionary (cached)."""
        ...

    def get_syllable_frequency(self, syllable: str) -> int:
        """Get syllable frequency (cached)."""
        ...

    def get_word_frequency(self, word: str) -> int:
        """Get word frequency (cached)."""
        ...

CachedBigramSource

Wraps bigram lookups with caching:
class CachedBigramSource:
    """Cached wrapper for bigram lookups.

    Args:
        provider: Underlying bigram source to delegate to
        cache_size: Maximum number of cached (word1, word2) entries
    """

    def __init__(self, provider: BigramSource, cache_size: int = 16384):
        self._provider = provider
        self._cache_size = cache_size
        # Fix: the cache dict was never initialized, so the first lookup
        # raised AttributeError. A plain dict preserves insertion order,
        # which we use for FIFO eviction below.
        self._cache: Dict[tuple, float] = {}

    def get_bigram_probability(self, word1: str, word2: str) -> float:
        """Get bigram probability (cached, bounded by cache_size)."""
        key = (word1, word2)
        if key in self._cache:
            return self._cache[key]

        # Fix: was `self.provider` (attribute stored as `self._provider`).
        result = self._provider.get_bigram_probability(word1, word2)
        # Fix: honor cache_size (was previously ignored → unbounded growth).
        # FIFO eviction: drop the oldest-inserted entry when full.
        if len(self._cache) >= self._cache_size:
            self._cache.pop(next(iter(self._cache)))
        self._cache[key] = result
        return result

CachedPOSRepository

Wraps POS lookups with caching:
from myspellchecker.algorithms.cache import CachedPOSRepository

# Create cached POS repository (no cache_size param - uses lazy init internally)
cached_pos = CachedPOSRepository(provider=provider)

# Get POS for a word (cached)
pos = cached_pos.get_pos("သွား")

Cache Statistics

get_cache_stats

Get statistics for all caches:
def get_cache_stats(self) -> Dict[str, Dict[str, int]]:
    """Get statistics for all caches.

    Returns:
        Dict mapping cache name to stats dict with:
        - size: Current number of entries
        - hits: Number of cache hits
        - misses: Number of cache misses
        - hit_rate: Hit rate percentage

    NOTE(review): presumably only caches instantiated so far appear in the
    result (caches are lazily created) — confirm against the full source.
    """
Usage:
factory = AlgorithmFactory(provider)
symspell = factory.create_symspell()

# Use the algorithm
for word in words:
    symspell.lookup(word)

# Check cache performance
stats = factory.get_cache_stats()
for cache_name, cache_stats in stats.items():
    print(f"{cache_name}:")
    print(f"  Size: {cache_stats['size']}")
    print(f"  Hit rate: {cache_stats['hit_rate']:.1f}%")

clear_caches

Clear all caches:
def clear_caches(self) -> None:
    """Empty every algorithm cache managed by this factory.

    Useful when dictionary data changes or for memory management.
    """
    # Clear each cache in place so existing references to the cache
    # objects remain valid; only the stored entries are dropped.
    all_caches = list(self._caches.values())
    for single_cache in all_caches:
        single_cache.clear()

Performance Benefits

Speedup by Operation

| Operation      | Uncached | Cached  | Speedup |
|----------------|----------|---------|---------|
| Word lookup    | 0.5ms    | 0.005ms | 100x    |
| Bigram lookup  | 1ms      | 0.01ms  | 100x    |
| Suggestions    | 50ms     | 0.5ms   | 100x    |
| Semantic check | 200ms    | 2ms     | 100x    |

Memory Usage

| Cache Size | Memory | Typical Coverage    |
|------------|--------|---------------------|
| 1,000      | ~1MB   | 80% of common words |
| 10,000     | ~10MB  | 95% of common words |
| 100,000    | ~100MB | 99% of vocabulary   |

Integration with SpellChecker

The factory integrates with the main SpellChecker:
from myspellchecker import SpellChecker
from myspellchecker.core.config import SpellCheckerConfig, AlgorithmCacheConfig

config = SpellCheckerConfig(
    cache=AlgorithmCacheConfig(
        syllable_cache_size=4096,
        word_cache_size=8192,
        frequency_cache_size=10000,
    ),
)

checker = SpellChecker(config=config)
# Factory is created internally with caching enabled

Manual Factory Usage

from myspellchecker.algorithms.factory import AlgorithmFactory
from myspellchecker.core.validators import WordValidator

# Create factory
factory = AlgorithmFactory(provider, cache_sizes={"dictionary_word": 50000})

# Create cached algorithms
symspell = factory.create_symspell()
ngram = factory.create_ngram_checker()

# These algorithms are used internally by validators
# created through the DI container. For direct usage:
checker = SpellChecker(provider=provider)
result = checker.check("text")

Configuration

Factory Configuration

factory = AlgorithmFactory(
    provider=provider,
    enable_caching=True,     # Enable all caching (default)
    cache_sizes={            # Custom cache sizes per data type (optional)
        "dictionary_syllable": 4096,
        "dictionary_word": 8192,
        "bigram": 16384,
    },
)

Per-Algorithm Configuration

# Different cache sizes per algorithm
symspell = factory.create_symspell()
factory._caches["symspell"].max_size = 20000

ngram = factory.create_ngram_checker()
factory._caches["ngram"].max_size = 5000

Best Practices

1. Reuse Factory Instances

# Good: Single factory for application
factory = AlgorithmFactory(provider)
symspell = factory.create_symspell()
ngram = factory.create_ngram_checker()

# Bad: Multiple independent factories (caches are shared across instances
# only when share_caches=True and the provider matches)
factory1 = AlgorithmFactory(provider)
factory2 = AlgorithmFactory(provider)

2. Monitor Cache Performance

# Periodically check cache stats
stats = factory.get_cache_stats()
for name, s in stats.items():
    if s["hit_rate"] < 50:
        print(f"Warning: Low hit rate for {name}: {s['hit_rate']}%")

3. Clear Caches When Needed

# Clear provider caches (SQLiteProvider)
provider.clear_caches()

# Clear algorithm factory caches
factory.clear_caches()

# On memory pressure
if memory_usage > threshold:
    factory.clear_caches()

See Also