Core Classes
SpellChecker
The main spell checking class.
from myspellchecker import SpellChecker
class SpellChecker:
"""
Main spell checker class for Myanmar text validation.
Attributes:
config: SpellCheckerConfig instance
provider: Dictionary provider
segmenter: Text segmenter
"""
def __init__(
self,
config: SpellCheckerConfig = None,
segmenter: Segmenter = None,
provider: DictionaryProvider = None,
syllable_validator: SyllableValidator = None,
word_validator: WordValidator = None,
context_validator: ContextValidator = None,
factory: ComponentFactoryProtocol = None,
):
"""
Initialize SpellChecker.
Args:
config: Configuration settings (default: balanced preset)
segmenter: Custom Segmenter for text tokenization (default: DefaultSegmenter)
provider: Dictionary provider (default: SQLiteProvider)
syllable_validator: Custom SyllableValidator (advanced use)
word_validator: Custom WordValidator (advanced use)
context_validator: Custom ContextValidator (advanced use)
factory: Custom ComponentFactory for dependency injection (advanced use)
"""
# --- Factory Methods ---
@classmethod
def create_default(cls) -> "SpellChecker":
"""Create SpellChecker with default settings (balanced performance/accuracy)."""
@classmethod
def create_fast(cls) -> "SpellChecker":
"""Create SpellChecker optimized for speed (disables context checking, NER, phonetic)."""
@classmethod
def create_accurate(cls) -> "SpellChecker":
"""Create SpellChecker optimized for accuracy (higher edit distance, lower thresholds)."""
@classmethod
def create_minimal(cls) -> "SpellChecker":
"""Create SpellChecker with minimal features (basic syllable validation only)."""
@classmethod
def create_strict(cls) -> "SpellChecker":
"""Create SpellChecker with strict error detection (conservative thresholds)."""
# --- Core Methods ---
def check(
self,
text: str,
level: ValidationLevel = ValidationLevel.SYLLABLE,
use_semantic: Optional[bool] = None,
) -> Response:
"""
Check text for spelling errors.
Args:
text: Myanmar text to check
level: Validation level (SYLLABLE or WORD)
use_semantic: Override semantic checking for this call
Returns:
Response containing errors and suggestions
"""
async def check_async(
self,
text: str,
level: ValidationLevel = ValidationLevel.SYLLABLE,
use_semantic: Optional[bool] = None,
) -> Response:
"""
Asynchronously check text for spelling errors.
Runs the CPU-bound check() in a separate thread via asyncio.to_thread().
Args:
text: Myanmar text to check
level: Validation level (SYLLABLE or WORD)
use_semantic: Override semantic checking for this call
Returns:
Response containing errors and suggestions
"""
def check_batch(
self,
texts: list[str],
level: ValidationLevel = ValidationLevel.SYLLABLE,
) -> list[Response]:
"""
Check multiple texts sequentially.
Args:
texts: List of texts to check
level: Validation level (SYLLABLE or WORD)
Returns:
List of Response objects
"""
async def check_batch_async(
self,
texts: list[str],
level: ValidationLevel = ValidationLevel.SYLLABLE,
max_concurrency: int = 4,
) -> list[Response]:
"""
Asynchronously check multiple texts with configurable concurrency.
Args:
texts: List of texts to check
level: Validation level (SYLLABLE or WORD)
max_concurrency: Maximum concurrent operations (default: 4)
Returns:
List of Response objects
"""
def get_pos_tags(self, text: str = "", words: list[str] = None) -> list[str]:
"""
Get the most likely POS tag sequence for text or pre-segmented words.
Args:
text: Input text to tag (optional if words is provided)
words: Pre-segmented words (optional if text is provided)
Returns:
List of POS tags, one per word.
"""
def segment_and_tag(self, text: str) -> tuple[list[str], list[str]]:
"""
Perform joint word segmentation and POS tagging.
Uses joint Viterbi decoder if enabled (config.joint.enabled=True),
otherwise falls back to sequential segmentation then tagging.
Args:
text: Text to segment
Returns:
Tuple of (words, tags)
"""
def get_stemmer(self) -> "Stemmer":
"""Get a Stemmer instance for suffix stripping."""
def close(self) -> None:
"""Close and release resources."""
def __enter__(self) -> "SpellChecker":
"""Context manager entry."""
def __exit__(self, *args) -> None:
"""Context manager exit with cleanup."""
# --- Properties ---
@property
def symspell(self) -> Optional[SymSpell]:
"""Access SymSpell instance for direct suggestion lookups."""
@property
def context_checker(self) -> Optional[NgramContextChecker]:
"""Access NgramContextChecker for N-gram probability lookups."""
@property
def syllable_rule_validator(self) -> Optional[SyllableRuleValidator]:
"""Access SyllableRuleValidator for Myanmar orthographic validation."""
@property
def name_heuristic(self) -> Optional[NameHeuristic]:
"""Access NameHeuristic for proper noun detection."""
@property
def semantic_checker(self) -> Optional[SemanticChecker]:
"""Access SemanticChecker for AI-powered error detection."""
@property
def phonetic_hasher(self) -> Optional[PhoneticHasher]:
"""Access PhoneticHasher for phonetic similarity matching."""
SpellCheckerBuilder
Fluent builder for SpellChecker construction.
from myspellchecker.core import SpellCheckerBuilder
class SpellCheckerBuilder:
"""Fluent builder for SpellChecker instances."""
def with_config(self, config: SpellCheckerConfig) -> "SpellCheckerBuilder":
"""Set the full configuration object."""
def with_provider(self, provider: DictionaryProvider) -> "SpellCheckerBuilder":
"""Set a custom dictionary provider."""
def with_segmenter(self, segmenter: Segmenter) -> "SpellCheckerBuilder":
"""Set a custom text segmenter."""
def with_phonetic(self, enabled: bool = True) -> "SpellCheckerBuilder":
"""Enable or disable phonetic similarity matching."""
def with_context_checking(self, enabled: bool = True) -> "SpellCheckerBuilder":
"""Enable or disable N-gram context checking."""
def with_ner(self, enabled: bool = True) -> "SpellCheckerBuilder":
"""Enable or disable Named Entity Recognition heuristics."""
def with_rule_based_validation(self, enabled: bool = True) -> "SpellCheckerBuilder":
"""Enable or disable rule-based syllable validation."""
def with_max_edit_distance(self, distance: int) -> "SpellCheckerBuilder":
"""Set maximum edit distance for suggestions (1-3)."""
def with_max_suggestions(self, count: int) -> "SpellCheckerBuilder":
"""Set maximum number of suggestions per error."""
def with_symspell_prefix_length(self, length: int) -> "SpellCheckerBuilder":
"""Set SymSpell prefix length for performance optimization (typically 5-10)."""
def with_cache_size(self, size: int) -> "SpellCheckerBuilder":
"""Set provider cache size for memory optimization."""
def with_bigram_threshold(self, threshold: float) -> "SpellCheckerBuilder":
"""Set probability threshold for flagging bigram errors (0.0-1.0)."""
def with_trigram_threshold(self, threshold: float) -> "SpellCheckerBuilder":
"""Set probability threshold for flagging trigram errors (0.0-1.0)."""
def with_semantic_model(
self,
model_path: str = None,
tokenizer_path: str = None,
model: Any = None,
tokenizer: Any = None,
) -> "SpellCheckerBuilder":
"""Configure semantic checking model (paths or pre-loaded instances)."""
def with_word_engine(
self, engine: Literal["myword", "crf", "transformer"]
) -> "SpellCheckerBuilder":
"""Set the word segmentation engine."""
def with_pos_tagger(
self,
tagger_type: Literal["rule_based", "transformer", "viterbi", "custom"] = "rule_based",
model_name: str = None,
device: int = -1,
) -> "SpellCheckerBuilder":
"""Configure the Part-of-Speech tagger."""
def with_joint_segmentation(
self,
enabled: bool = True,
beam_width: int = 15,
) -> "SpellCheckerBuilder":
"""Configure Joint Segmentation and POS Tagging."""
def build(self) -> SpellChecker:
"""Construct SpellChecker with all configured options."""
Example: building a checker with a custom provider.
from myspellchecker.core.builder import SpellCheckerBuilder
from myspellchecker.providers import SQLiteProvider
# Using custom provider
provider = SQLiteProvider(database_path="/path/to/db.sqlite")
checker = (
SpellCheckerBuilder()
.with_provider(provider)
.with_phonetic(True)
.with_context_checking(True)
.build()
)
ConfigPresets
Pre-configured SpellCheckerConfig instances for common use cases.
from myspellchecker.core.builder import ConfigPresets
# Use a preset directly
checker = SpellChecker(config=ConfigPresets.FAST)
# Customize a preset (each access returns a deep copy, safe to modify)
config = ConfigPresets.ACCURATE
config.max_suggestions = 10
checker = SpellChecker(config=config)
Available presets: DEFAULT, FAST, ACCURATE, MINIMAL, STRICT.
Configuration Classes
SpellCheckerConfig
Main configuration class (Pydantic BaseModel).
from myspellchecker.core.config import SpellCheckerConfig, get_profile
class SpellCheckerConfig(BaseModel):
"""Spell checker configuration (Pydantic BaseModel)."""
# Core dependencies (runtime objects)
segmenter: Optional[Segmenter] = None
provider: Optional[DictionaryProvider] = None
# Suggestion settings
max_suggestions: int = 5
max_edit_distance: int = 2 # Range: 1-3
# Feature toggles
use_phonetic: bool = True
use_context_checker: bool = True
use_ner: bool = True
use_rule_based_validation: bool = True
# Word segmentation
word_engine: Literal["myword", "crf", "transformer"] = "myword"
seg_model: Optional[str] = None # Custom model for transformer engine
seg_device: int = -1 # -1=CPU, 0+=GPU (transformer only)
# Behavior
fallback_to_empty_provider: bool = False # Allow empty MemoryProvider if DB not found
# Nested configurations (each defaults to a new instance with its own defaults)
symspell: SymSpellConfig = SymSpellConfig()
ngram_context: NgramContextConfig = NgramContextConfig()
phonetic: PhoneticConfig = PhoneticConfig()
pos_tagger: POSTaggerConfig = POSTaggerConfig()
semantic: SemanticConfig = SemanticConfig()
validation: ValidationConfig = ValidationConfig()
provider_config: ProviderConfig = ProviderConfig()
joint: JointConfig = JointConfig()
cache: AlgorithmCacheConfig = AlgorithmCacheConfig()
ranker: RankerConfig = RankerConfig()
# Use get_profile() for presets:
config = get_profile("development") # Fast iteration, minimal validation
config = get_profile("production") # Balanced (default)
config = get_profile("testing") # Deterministic, reproducible
config = get_profile("fast") # Maximum speed
config = get_profile("accurate") # Maximum accuracy
ValidationLevel
Enum for validation depth.
from myspellchecker.core.constants import ValidationLevel
class ValidationLevel(str, Enum):
SYLLABLE = "syllable" # Fast syllable-only validation
WORD = "word" # Thorough word + context validation
Note: Validation level is passed to check() and other methods, not as a configuration option.
POSTaggerConfig
POS tagger configuration.
class POSTaggerConfig(BaseModel):
"""POS tagger configuration (pydantic model)."""
tagger_type: str = "rule_based" # "rule_based", "viterbi", "transformer"
model_name: str | None = None # HuggingFace model ID (for transformer)
device: int = -1 # -1 for CPU, 0+ for GPU
batch_size: int = 32
cache_size: int = 10000 # LRU cache size
use_morphology_fallback: bool = True
beam_width: int = 10 # For Viterbi tagger
unknown_tag: str = "UNK" # Tag for unknown words
SemanticConfig
Semantic checker configuration.
class SemanticConfig(BaseModel):
"""Semantic checker configuration (Pydantic BaseModel)."""
model_path: str = None
tokenizer_path: str = None
model: Any = None # Pre-loaded ONNX session
tokenizer: Any = None # Pre-loaded tokenizer
num_threads: int = 1 # ONNX inference threads
predict_top_k: int = 5 # Top-K predictions
check_top_k: int = 10 # Tokens to check
use_semantic_refinement: bool = True
use_proactive_scanning: bool = False # AI-powered error detection
proactive_confidence_threshold: float = 0.5 # Threshold for proactive scanning
Response Classes
Response
Result of spell checking.
from myspellchecker.core.response import Response
@dataclass
class Response:
"""Result of spell checking."""
text: str
"""Original input text (unchanged)."""
corrected_text: str
"""Auto-corrected text using top suggestions."""
has_errors: bool
"""True if any errors detected."""
level: str
"""Validation level used ('syllable' or 'word')."""
errors: list[Error]
"""List of Error objects (SyllableError, WordError, ContextError, GrammarError)."""
metadata: dict
"""Additional metadata (processing_time, layers_applied, etc.)."""
def to_dict(self) -> dict:
"""Convert to dictionary for JSON serialization."""
def to_json(self, indent: int = 2) -> str:
"""Convert to JSON string."""
Error
Base error class.
from myspellchecker.core.response import Error, SyllableError, WordError, ContextError, GrammarError
from myspellchecker.core.constants import ErrorType
@dataclass
class Error:
"""Spelling error."""
text: str
"""The erroneous text (syllable or word)."""
position: int
"""Character position in original text (0-indexed)."""
suggestions: list[str]
"""Suggested corrections, ranked by likelihood."""
error_type: str
"""Type of error ('invalid_syllable', 'invalid_word', etc.)."""
confidence: float = 1.0
"""Confidence score (0.0-1.0). Higher = more certain."""
def to_dict(self) -> dict:
"""Convert to dictionary."""
def to_json(self, indent: int = 2) -> str:
"""Convert to JSON string."""
def get_localized_message(self, language: str = None) -> str:
"""Get a localized error message ('en' or 'my')."""
@dataclass
class SyllableError(Error):
"""Invalid syllable error (Layer 1). Default error_type: 'invalid_syllable'."""
error_type: str = "invalid_syllable"
@dataclass
class WordError(Error):
"""Invalid word error (Layer 2). Default error_type: 'invalid_word'."""
syllable_count: int = 0
error_type: str = "invalid_word"
@dataclass
class ContextError(Error):
"""Context error - unlikely word sequence (Layer 3). Default error_type: 'context_probability'."""
probability: float = 0.0
prev_word: str = ""
error_type: str = "context_probability"
@dataclass
class GrammarError(Error):
"""Grammar-related errors. Default error_type: 'grammar_error'."""
reason: str = ""
error_type: str = "grammar_error"
@property
def word(self) -> str:
"""Alias for 'text' for backward compatibility."""
@property
def suggestion(self) -> str:
"""Return first suggestion for backward compatibility."""
class ErrorType(str, Enum):
SYLLABLE = "invalid_syllable"
WORD = "invalid_word"
CONTEXT_PROBABILITY = "context_probability"
GRAMMAR = "grammar_error"
PARTICLE_TYPO = "particle_typo"
MEDIAL_CONFUSION = "medial_confusion"
COLLOQUIAL_VARIANT = "colloquial_variant"
COLLOQUIAL_INFO = "colloquial_info"
QUESTION_STRUCTURE = "question_structure"
SYNTAX_ERROR = "syntax_error"
HOMOPHONE_ERROR = "homophone_error"
TONE_AMBIGUITY = "tone_ambiguity"
POS_SEQUENCE_ERROR = "pos_sequence_error"
SEMANTIC_ERROR = "semantic_error"
ZAWGYI_ENCODING = "zawgyi_encoding"
MIXED_REGISTER = "mixed_register"
ASPECT_TYPO = "aspect_typo"
INVALID_SEQUENCE = "invalid_sequence"
INCOMPLETE_ASPECT = "incomplete_aspect"
TYPO = "typo"
AGREEMENT = "agreement"
COMPOUND_TYPO = "compound_typo"
INCOMPLETE_REDUPLICATION = "incomplete_reduplication"
CLASSIFIER_TYPO = "classifier_typo"
Provider Classes
DictionaryProvider
Abstract provider interface.
from myspellchecker.providers.base import DictionaryProvider
class DictionaryProvider(ABC):
"""Dictionary data provider interface."""
# --- Core abstract methods (must be implemented) ---
def is_valid_syllable(self, syllable: str) -> bool:
"""Check if syllable exists."""
def is_valid_word(self, word: str) -> bool:
"""Check if word exists."""
def get_syllable_frequency(self, syllable: str) -> int:
"""Get syllable corpus frequency count."""
def get_word_frequency(self, word: str) -> int:
"""Get word corpus frequency count."""
def get_word_pos(self, word: str) -> str | None:
"""Get word POS tag (pipe-separated for multi-POS, e.g. 'N|V')."""
def get_bigram_probability(self, prev_word: str, current_word: str) -> float:
"""Get conditional probability P(current_word | prev_word)."""
def get_trigram_probability(self, w1: str, w2: str, w3: str) -> float:
"""Get conditional probability P(w3 | w1, w2)."""
def get_top_continuations(self, prev_word: str, limit: int = 20) -> list[tuple[str, float]]:
"""Get most likely words to follow prev_word, as (word, probability) tuples."""
def get_all_syllables(self) -> Iterator[tuple[str, int]]:
"""Get iterator over all (syllable, frequency) pairs. Used for SymSpell indexing."""
def get_all_words(self) -> Iterator[tuple[str, int]]:
"""Get iterator over all (word, frequency) pairs. Used for SymSpell indexing."""
def get_pos_unigram_probabilities(self) -> dict[str, float]:
"""Get all POS unigram probabilities."""
def get_pos_bigram_probabilities(self) -> dict[tuple[str, str], float]:
"""Get all POS bigram probabilities."""
def get_pos_trigram_probabilities(self) -> dict[tuple[str, str, str], float]:
"""Get all POS trigram probabilities."""
# --- Bulk operations (default implementations, override for optimization) ---
def is_valid_syllables_bulk(self, syllables: list[str]) -> dict[str, bool]:
"""Check validity of multiple syllables in a single operation."""
def is_valid_words_bulk(self, words: list[str]) -> dict[str, bool]:
"""Check validity of multiple words in a single operation."""
def get_syllable_frequencies_bulk(self, syllables: list[str]) -> dict[str, int]:
"""Get corpus frequencies for multiple syllables."""
def get_word_frequencies_bulk(self, words: list[str]) -> dict[str, int]:
"""Get corpus frequencies for multiple words."""
def get_word_pos_bulk(self, words: list[str]) -> dict[str, str | None]:
"""Get POS tags for multiple words."""
# --- Convenience methods ---
def has_syllable(self, syllable: str) -> bool:
"""Pure existence check for syllable (delegates to is_valid_syllable)."""
def has_word(self, word: str) -> bool:
"""Pure existence check for word (delegates to is_valid_word)."""
def __contains__(self, item: str) -> bool:
"""Support 'in' operator: checks syllables first, then words."""
# --- Factory method ---
@classmethod
def create(cls, provider_type: str = "sqlite", **kwargs) -> "DictionaryProvider":
"""Factory method to create provider instances ('sqlite', 'memory', 'json', 'csv')."""
def close(self) -> None:
"""Close and cleanup (optional, not all providers need this)."""
SQLiteProvider
SQLite-based provider.
from myspellchecker.providers import SQLiteProvider
class SQLiteProvider(DictionaryProvider):
"""SQLite-based dictionary provider."""
def __init__(
self,
database_path: str | None = None,
cache_size: int = 8192,
check_same_thread: bool = False,
pos_tagger: POSTaggerBase = None,
pool_min_size: int = 1,
pool_max_size: int = 5,
pool_timeout: float = 5.0,
pool_max_connection_age: float = 3600.0,
sqlite_timeout: float = 30.0,
cache_manager: CacheManager = None,
):
"""
Initialize SQLite provider.
Args:
database_path: Database path (None for default)
cache_size: LRU cache size for frequency lookups (default: 8192)
check_same_thread: Allow sharing connection between threads (default: False)
pos_tagger: Optional POS tagger for OOV word tagging
pool_min_size: Minimum connections in pool
pool_max_size: Maximum connections in pool
pool_timeout: Connection checkout timeout in seconds
pool_max_connection_age: Max connection age before recreation (default: 3600.0)
sqlite_timeout: SQLite busy timeout in seconds
cache_manager: Optional CacheManager for dependency injection
"""
MemoryProvider
In-memory provider optimized for fast lookups.
from myspellchecker.providers import MemoryProvider
class MemoryProvider(DictionaryProvider):
"""In-memory dictionary provider using Python dictionaries."""
def __init__(
self,
syllables: dict[str, int] = None,
words: dict[str, int] = None,
bigrams: dict[tuple[str, str], float] = None,
trigrams: dict[tuple[str, str, str], float] = None,
word_pos: dict[str, str] = None,
):
"""
Initialize MemoryProvider with optional pre-populated data.
Args:
syllables: Dictionary mapping syllable -> frequency count
words: Dictionary mapping word -> frequency count
bigrams: Dictionary mapping (prev_word, curr_word) -> probability
trigrams: Dictionary mapping (word1, word2, word3) -> probability
word_pos: Dictionary mapping word -> POS tag
"""
def add_syllable(self, syllable: str, frequency: int = 1) -> None:
"""Add a syllable with optional frequency."""
def add_word(self, word: str, frequency: int = 1) -> None:
"""Add a word with optional frequency."""
Algorithm Classes
SymSpell
Symmetric delete spell checking.
from myspellchecker.algorithms.symspell import SymSpell, Suggestion
class SymSpell:
"""SymSpell algorithm for O(1) suggestions."""
def __init__(
self,
provider: DictionaryProvider,
max_edit_distance: int = 2,
prefix_length: int = 10,
count_threshold: int = 1,
):
"""
Initialize SymSpell with a dictionary provider.
Note: The class constructor default for count_threshold is 1,
but SymSpellConfig sets its default to 50. When constructed
via SpellCheckerConfig, the config value (50) takes precedence.
"""
def build_index(self, levels: list[str]) -> None:
"""Build delete index for specified levels ('syllable', 'word')."""
def lookup(
self,
term: str,
level: str = "syllable",
max_suggestions: int = 5,
include_known: bool = False,
use_phonetic: bool = False,
) -> list[Suggestion]:
"""
Look up suggestions for a term.
Returns:
List of Suggestion with term, edit_distance, frequency
"""
NgramContextChecker
N-gram based context checker.
from myspellchecker.algorithms.ngram_context_checker import NgramContextChecker
class NgramContextChecker:
"""N-gram based context validation."""
def __init__(
self,
provider: DictionaryProvider,
threshold: float = 0.01,
trigram_threshold: float = 0.005,
right_context_threshold: float = None,
max_suggestions: int = 5,
edit_distance_weight: float = 0.3,
probability_weight: float = 0.7,
symspell: SymSpell = None,
candidate_limit: int = 50,
smoothing_strategy: SmoothingStrategy = SmoothingStrategy.STUPID_BACKOFF,
backoff_weight: float = 0.4,
add_k_smoothing: float = 0.0,
):
"""Initialize context checker."""
def get_smoothed_bigram_probability(self, word1: str, word2: str) -> float:
"""Get smoothed P(word2 | word1)."""
def get_smoothed_trigram_probability(self, word1: str, word2: str, word3: str) -> float:
"""Get smoothed P(word3 | word1, word2)."""
def is_contextual_error(
self,
prev_word: str,
current_word: str,
prev_prev_word: Optional[str] = None,
next_word: Optional[str] = None,
threshold: Optional[float] = None,
) -> bool:
"""Check if a word is a contextual error given surrounding context."""
def suggest(
self,
prev_word: str,
current_word: str,
max_edit_distance: int = 2,
next_word: Optional[str] = None,
) -> list[ContextSuggestion]:
"""Generate context-aware suggestions for a word."""
def analyze_sequence(
self,
words: list[str],
min_probability: Optional[float] = None,
) -> list[tuple[int, float, bool]]:
"""Analyze word sequence for contextual errors."""
SemanticChecker
Deep learning based context checker.
from myspellchecker.algorithms.semantic_checker import SemanticChecker
class SemanticChecker:
"""ONNX-based semantic context checker."""
def __init__(
self,
model_path: str = None,
tokenizer_path: str = None,
model: Any = None,
tokenizer: Any = None,
num_threads: int = 1,
predict_top_k: int = 5,
check_top_k: int = 10,
use_pytorch: bool = False,
allow_extended_myanmar: bool = False,
):
"""Initialize semantic checker."""
def is_semantic_error(
self,
sentence: str,
word: str,
neighbors: list[str],
) -> Optional[str]:
"""Check if word is a semantic error using AI. Returns suggestion or None."""
def predict_mask(
self,
sentence: str,
target_word: str,
top_k: int = None,
occurrence: int = 0,
) -> list[tuple[str, float]]:
"""Predict most likely words for a masked position."""
Segmenter Classes
DefaultSegmenter
Default text segmenter.
from myspellchecker.segmenters import DefaultSegmenter
class DefaultSegmenter(Segmenter):
"""Default Myanmar text segmenter using a hybrid approach."""
def __init__(
self,
word_engine: str = "myword",
allow_extended_myanmar: bool = False,
seg_model: Optional[str] = None,
seg_device: int = -1,
):
"""
Initialize segmenter.
Args:
word_engine: Word segmentation engine ("myword", "crf", or "transformer")
allow_extended_myanmar: Accept Extended Myanmar characters (U+1050-U+109F,
U+AA60-U+AA7F, U+A9E0-U+A9FF)
seg_model: Custom model name for transformer engine (optional)
seg_device: Device for transformer inference (-1=CPU, 0+=GPU)
"""
def segment_syllables(self, text: str) -> list[str]:
"""Segment text into syllables."""
def segment_words(self, text: str) -> list[str]:
"""Segment text into words."""
def segment_sentences(self, text: str) -> list[str]:
"""Segment text into sentences using heuristics."""
def load_custom_dictionary(self, words: list[str]) -> None:
"""Load custom dictionary words (myword engine only)."""
Utility Functions
Text Normalization
Functions for normalizing Myanmar text before lookup and comparison.
from myspellchecker.text.normalize import (
normalize,
normalize_for_lookup,
)
def normalize(
text: str,
form: Literal["NFC", "NFD", "NFKC", "NFKD"] = "NFC",
remove_zero_width: bool = True,
reorder_diacritics: bool = True,
normalize_variants: bool = False,
normalize_tall_aa: bool = True,
normalize_u_asat: bool = True,
) -> str:
"""
Normalize Myanmar text with configurable steps.
Args:
text: Input Myanmar text
form: Unicode normalization form
remove_zero_width: Remove zero-width characters
reorder_diacritics: Apply Myanmar-specific diacritic reordering (UTN #11)
normalize_variants: Map character variants to canonical forms
normalize_tall_aa: Correct Tall AA after Medial Wa (default: True)
normalize_u_asat: Convert independent vowel U + asat to consonant form (default: True)
"""
def normalize_for_lookup(
text: str,
convert_zawgyi: bool = True,
config: Optional[ZawgyiConfig] = None,
) -> str:
"""Unified normalization for all dictionary/index lookups (includes Zawgyi conversion)."""
# For direct Cython function access (requires compiled extensions):
from myspellchecker.text.normalize_c import (
remove_zero_width_chars,
reorder_myanmar_diacritics,
get_myanmar_ratio,
)
# For higher-level normalization with presets:
from myspellchecker.text.normalization_service import (
NormalizationService,
normalize_for_spell_checking,
normalize_for_lookup,
normalize_for_comparison,
)
Logging Configuration
Configure library-wide logging behavior.
from myspellchecker.utils.logging_utils import configure_logging
def configure_logging(
level: Union[int, str] = logging.INFO,
format_string: str = None,
stream: TextIO = None,
json_output: bool = False,
debug_mode: bool = False,
) -> None:
"""Configure logging for the library."""
Exceptions
All exceptions are importable from a single module:
from myspellchecker.core.exceptions import (
MyanmarSpellcheckError,
ConfigurationError,
InvalidConfigError,
DataLoadingError,
MissingDatabaseError,
ProcessingError,
ValidationError,
TokenizationError,
NormalizationError,
ProviderError,
ConnectionPoolError,
PipelineError,
IngestionError,
PackagingError,
ModelError,
ModelLoadError,
InferenceError,
MissingDependencyError,
InsufficientStorageError,
CacheError,
)
Exception hierarchy:
MyanmarSpellcheckError (base)
├── ConfigurationError
│ └── InvalidConfigError
├── DataLoadingError
│ └── MissingDatabaseError
├── ProcessingError
│ ├── ValidationError
│ ├── TokenizationError
│ └── NormalizationError
├── ProviderError
│ └── ConnectionPoolError
├── PipelineError
│ ├── IngestionError
│ └── PackagingError
├── ModelError
│ ├── ModelLoadError
│ └── InferenceError
├── MissingDependencyError
├── InsufficientStorageError
└── CacheError
Exception class descriptions:
class MyanmarSpellcheckError(Exception):
"""Base exception for all spell checker errors."""
class ConfigurationError(MyanmarSpellcheckError):
"""Configuration-related errors."""
class InvalidConfigError(ConfigurationError):
"""Specific configuration value is invalid."""
class DataLoadingError(MyanmarSpellcheckError):
"""Data loading errors."""
class MissingDatabaseError(DataLoadingError):
"""Spell checker database not found. Includes searched_paths and suggestion attributes."""
class ProcessingError(MyanmarSpellcheckError):
"""Text processing errors (base for validation/tokenization/normalization)."""
class ValidationError(ProcessingError):
"""Validation processing errors."""
class TokenizationError(ProcessingError):
"""Text tokenization/segmentation errors."""
class NormalizationError(ProcessingError):
"""Text normalization errors."""
class ProviderError(MyanmarSpellcheckError):
"""Provider-related errors."""
class ConnectionPoolError(ProviderError):
"""Connection pool errors (exhaustion, creation failures)."""
class PipelineError(MyanmarSpellcheckError):
"""Data pipeline errors."""
class IngestionError(PipelineError):
"""Corpus ingestion errors. Has failed_files and missing_files attributes."""
class PackagingError(PipelineError):
"""Database packaging errors."""
class ModelError(MyanmarSpellcheckError):
"""Machine learning model errors."""
class ModelLoadError(ModelError):
"""Model loading failures."""
class InferenceError(ModelError):
"""Model inference failures."""
class MissingDependencyError(MyanmarSpellcheckError):
"""Required external dependency is missing."""
class InsufficientStorageError(MyanmarSpellcheckError):
"""Not enough disk space for operation."""
class CacheError(MyanmarSpellcheckError):
"""Caching operation failures."""
Module Index
| Module | Description | Documentation |
|---|---|---|
| `myspellchecker` | Main package exports | This page |
| `myspellchecker.core` | Core classes and config | This page |
| `myspellchecker.algorithms` | Spell check algorithms | Algorithms |
| `myspellchecker.providers` | Dictionary providers | Provider Capabilities |
| `myspellchecker.segmenters` | Text segmenters | This page |
| `myspellchecker.tokenizers` | Low-level tokenizers | Tokenizers API |
| `myspellchecker.utils` | Utility functions | This page |
| `myspellchecker.data_pipeline` | Dictionary building | Data Pipeline |
| `myspellchecker.training` | Model training | Training |
Next Steps
- Getting Started - Quick start guide
- Configuration - Configuration options
- CLI Reference - Command-line interface