Skip to main content
This page catalogs every character set, validation set, enum, and default value defined in the constants modules — from the 33 base consonants through medial compatibility tables to POS tag constants. Source: src/myspellchecker/core/constants/myanmar_constants.py and src/myspellchecker/core/constants/core_constants.py

Unicode Ranges

Main Myanmar Block

# Main Myanmar block as a (start, end) tuple of code points.
# Both boundaries are inclusive: U+1000 through U+109F.
MYANMAR_RANGE = (0x1000, 0x109F)

Extended Blocks

# Myanmar Extended-A: U+AA60 to U+AA7F, as a (start, end) tuple
MYANMAR_EXTENDED_A_RANGE = (0xAA60, 0xAA7F)

# Myanmar Extended-B: U+A9E0 to U+A9FF, as a (start, end) tuple
MYANMAR_EXTENDED_B_RANGE = (0xA9E0, 0xA9FF)

# Regex character class matching ONE character from any of the three
# ranges above (main block, Extended-B, Extended-A). This is pattern
# source text, not a compiled pattern.
MYANMAR_RANGE_REGEX_STR = r"[\u1000-\u109F\uA9E0-\uA9FF\uAA60-\uAA7F]"

Extended-A Character Ranges (U+AA60–U+AA7F)

Range | Usage
U+AA60–U+AA6F | Shan consonants
U+AA70–U+AA76 | Shan vowels and tones
U+AA77–U+AA79 | Shan symbols

Extended-B Character Ranges (U+A9E0–U+A9FF)

Range | Usage
U+A9E0–U+A9E4 | Shan letters
U+A9E5 | Shan sign
U+A9E6 | Reduplication mark

Character Set Constants

# Every code point of the Extended-A block, U+AA60..U+AA7F inclusive (set)
MYANMAR_EXTENDED_A_CHARS = {chr(cp) for cp in range(0xAA60, 0xAA7F + 1)}

# Every code point of the Extended-B block, U+A9E0..U+A9FF inclusive (set)
MYANMAR_EXTENDED_B_CHARS = {chr(cp) for cp in range(0xA9E0, 0xA9FF + 1)}

# Union of the main block (U+1000..U+109F) with both extended blocks (set)
ALL_MYANMAR_CHARS = {chr(cp) for cp in range(0x1000, 0x109F + 1)}.union(
    MYANMAR_EXTENDED_A_CHARS,
    MYANMAR_EXTENDED_B_CHARS,
)

# Core Burmese-only characters (U+1000-U+104F minus non-standard chars)
MYANMAR_CORE_CHARS: Set[str]

# Extended Core Block (U+1050-U+109F) - Shan/Mon/Karen additions
MYANMAR_EXTENDED_CORE_BLOCK: Set[str]

# All extended blocks combined (out of scope for v1.0)
EXTENDED_MYANMAR_CHARS: Set[str]

Helper Functions

def get_myanmar_char_set(allow_extended: bool = False) -> Set[str]:
    """Get Myanmar character set based on scope.

    Signature listing only; the implementation is not shown on this page.

    Args:
        allow_extended: Scope switch, off by default. Presumably widens the
            result to include the extended blocks -- confirm against the
            implementation.

    Returns:
        A set of strings, per the return annotation.
    """

def has_extended_myanmar_chars(text: str) -> bool:
    """Check if text contains Extended Myanmar characters.

    Signature listing only; the implementation is not shown on this page.

    Args:
        text: The string to inspect.

    Returns:
        A bool, per the return annotation. Which character set counts as
        "Extended" is not visible here -- presumably EXTENDED_MYANMAR_CHARS;
        confirm against the implementation.
    """

def is_myanmar_text(text: str, allow_extended: bool = False) -> bool:
    """Check if text contains any Myanmar characters.

    Signature listing only; the implementation is not shown on this page.

    Args:
        text: The string to inspect.
        allow_extended: Scope switch, off by default. Presumably lets
            extended-block characters count as Myanmar -- confirm against
            the implementation.

    Returns:
        A bool, per the return annotation.
    """

Character Sets

Consonants

# The 33 base consonants fill U+1000..U+1020. Great Sa (U+103F) lies outside
# that contiguous run, so it is merged in as a second step.
# Type: set (mutable)
CONSONANTS = {chr(cp) for cp in range(0x1000, 0x1020 + 1)}
CONSONANTS |= {GREAT_SA}  # U+103F
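Char | Code Point | Name | Romanization
က | U+1000 | KA | ka
ခ | U+1001 | KHA | kha
ဂ | U+1002 | GA | ga
ဃ | U+1003 | GHA | gha
င | U+1004 | NGA | nga
စ | U+1005 | CA | sa
ဆ | U+1006 | CHA | hsa
ဇ | U+1007 | JA | za
ဈ | U+1008 | JHA | zha
ဉ | U+1009 | NYA (archaic) | nya
ည | U+100A | NYA | nya
ဋ | U+100B | TTA | tta
ဌ | U+100C | TTHA | ttha
ဍ | U+100D | DDA | dda
ဎ | U+100E | DDHA | ddha
ဏ | U+100F | NNA | nna
တ | U+1010 | TA | ta
ထ | U+1011 | THA | hta
ဒ | U+1012 | DA | da
ဓ | U+1013 | DHA | dha
န | U+1014 | NA | na
ပ | U+1015 | PA | pa
ဖ | U+1016 | PHA | pha
ဗ | U+1017 | BA | ba
ဘ | U+1018 | BHA | bha
မ | U+1019 | MA | ma
ယ | U+101A | YA | ya
ရ | U+101B | RA | ra
လ | U+101C | LA | la
ဝ | U+101D | WA | wa
သ | U+101E | SA | tha
ဟ | U+101F | HA | ha
ဠ | U+1020 | LLA | lla
ဿ | U+103F | GREAT SA (added separately) | ssa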

Non-Standard Characters

# Mon/Shan specific chars in Core Block -- not used in standard Burmese.
# Each entry is annotated with its Unicode character name.
NON_STANDARD_CHARS = {
    "\u1022",  # MYANMAR LETTER SHAN A
    "\u1028",  # MYANMAR LETTER MON E
    "\u1033",  # MYANMAR VOWEL SIGN MON II
    "\u1034",  # MYANMAR VOWEL SIGN MON O
    "\u1035",  # MYANMAR VOWEL SIGN E ABOVE
}

Independent Vowels

# Stand-alone vowels, U+1021..U+102A inclusive
# Type: set
# This full set still contains the vowel carrier (U+1021) and the Shan
# letter (U+1022).
INDEPENDENT_VOWELS = {chr(cp) for cp in range(0x1021, 0x102A + 1)}

# Strict subset for standard Burmese: the carrier and the non-standard
# letters are left out.
INDEPENDENT_VOWELS_STRICT = {
    chr(cp)
    for cp in (
        0x1023,  # I
        0x1024,  # II
        0x1025,  # U
        0x1026,  # UU
        0x1027,  # E
        0x1029,  # O
        0x102A,  # AU
    )
}

# Vowel carrier (U+1021) -- fills the consonant slot in syllable structure
VOWEL_CARRIER = chr(0x1021)
Note: Use INDEPENDENT_VOWELS_STRICT for standard Burmese-only validation.

Vowel Signs (Dependent Vowels)

# Dependent vowel signs that attach to a consonant, U+102B..U+1032 inclusive
# Type: set
VOWEL_SIGNS = {chr(cp) for cp in range(0x102B, 0x1032 + 1)}
Char | Code Point | Name
ါ | U+102B | TALL AA
ာ | U+102C | AA
ိ | U+102D | I
ီ | U+102E | II
ု | U+102F | U
ူ | U+1030 | UU
ေ | U+1031 | E (left-side, stored after consonant)
ဲ | U+1032 | AI

Vowel Classification

# Vowel subsets grouped by rendering position (used in validation)
UPPER_VOWELS = {chr(cp) for cp in (0x102D, 0x102E, 0x1032)}  # I, II, AI
LOWER_VOWELS = {chr(cp) for cp in (0x102F, 0x1030)}          # U, UU

# Vowel signs that must not be combined with the E vowel
INVALID_E_COMBINATIONS = {
    chr(cp) for cp in (0x102D, 0x102E, 0x102F, 0x1030)  # I, II, U, UU
}

# Permitted two-sign vowel digraphs; each pair is stored as an unordered
# frozenset.
VALID_VOWEL_COMBINATIONS = {
    frozenset(pair)
    for pair in (
        ("\u1031", "\u102c"),  # E + Aa
        ("\u1031", "\u102b"),  # E + Tall A
        ("\u102d", "\u102f"),  # I + U
    )
}

# Vowel signs listed as compatible with Anusvara (U+1036)
ANUSVARA_ALLOWED_VOWELS = {chr(cp) for cp in (0x102D, 0x102F)}  # I, U

Medials

# Individual medial constants (consonant modifiers, U+103B..U+103E)
# NOTE(review): in common Burmese usage U+103B (ျ) is called "ya-pin" and
# U+103C (ြ) "ya-yit", the reverse of the labels used here -- confirm the
# intended naming before relying on the *_YA_YIT / *_YA_PIN aliases.
MEDIAL_YA = "\u103b"  # ျ - labelled Ya-yit in this module
MEDIAL_RA = "\u103c"  # ြ - labelled Ya-pin in this module, left-side ra
MEDIAL_WA = "\u103d"  # ွ - Wa-hswe
MEDIAL_HA = "\u103e"  # ှ - Ha-htoe

# All four medials together
# Type: set
MEDIALS = {MEDIAL_YA, MEDIAL_RA, MEDIAL_WA, MEDIAL_HA}

# Phonetic aliases
MEDIAL_YA_YIT = MEDIAL_YA  # ျ - /j/ glide
MEDIAL_YA_PIN = MEDIAL_RA  # ြ - /r/ glide

Signs and Marks

# Special signs (individual constants)
ANUSVARA = "\u1036"   # Nasal mark (Thay-thay-tin)
DOT_BELOW = "\u1037"  # Creaky tone (Auk-myit)
VISARGA = "\u1038"    # Emphatic/final (Wit-sa-pauk)
VIRAMA = "\u1039"     # Stacker (pat sint)
ASAT = "\u103a"       # Vowel killer
NGA = "\u1004"        # Nga consonant
GREAT_SA = "\u103f"   # Great Sa

# Tone marks: the three tone/nasal markers collected into one set
TONE_MARKS = {ANUSVARA, DOT_BELOW, VISARGA}

# Specific vowel signs referenced by name
VOWEL_E = "\u1031"    # E vowel (pre-consonant)
VOWEL_AI = "\u1032"   # AI vowel

# Placeholder token for English text
ENG_TOKEN = "<ENG>"

# Dependent various signs, U+1032..U+103E inclusive
DEPENDENT_VARIOUS_SIGNS = {chr(cp) for cp in range(0x1032, 0x103E + 1)}

Tone Mark Rules

# Pairs of marks that must not co-occur; each pair is an unordered frozenset
INVALID_TONE_COMBINATIONS = {
    frozenset(pair)
    for pair in (
        ("\u1036", "\u103a"),  # Anusvara + Asat
        ("\u1037", "\u1038"),  # Dot Below + Visarga
    )
}

# Tone marks that cannot appear together with Asat
TONE_MARKS_INCOMPATIBLE_WITH_ASAT = set("\u1036")  # Anusvara

# Vowels that cannot appear together with Anusvara (no entries at present)
VOWELS_INCOMPATIBLE_WITH_ANUSVARA: Set[str] = set()

Myanmar Numerals

# Myanmar digits 0-9, which occupy U+1040..U+1049 in numeric order
# Type: set
MYANMAR_NUMERALS = {chr(0x1040 + digit) for digit in range(10)}

# Myanmar numeral words (written form, Dict[str, int])
MYANMAR_NUMERAL_WORDS = {
    "တစ်": 1,
    "နှစ်": 2,
    "သုံး": 3,
    ...
    "သန်း": 1000000,
}

Punctuation

# Myanmar punctuation, U+104A..U+104F inclusive
# Type: set at this point; the source module later redefines
# MYANMAR_PUNCTUATION as a frozenset.
MYANMAR_PUNCTUATION = {chr(cp) for cp in range(0x104A, 0x104F + 1)}

# Common punctuation (mixed Myanmar and ASCII)
COMMON_PUNCTUATION = set(...)

# Myanmar-specific separators, written as escapes like the other constants
SENTENCE_SEPARATOR = "\u104b"  # ။ Myanmar full stop
PHRASE_SEPARATOR = "\u104a"    # ၊ Myanmar comma

Section Marks and Logographic Particles

# Section marks ၌ ၍ (frozenset)
SECTION_MARKS = frozenset("\u104c\u104d")

# Reference marks ၎ ၏ (frozenset)
REFERENCE_MARKS = frozenset("\u104e\u104f")

# Both groups of logographic particles together (frozenset)
LOGOGRAPHIC_PARTICLES = SECTION_MARKS.union(REFERENCE_MARKS)

# Valid particles: the same four characters, U+104C..U+104F (frozenset)
VALID_PARTICLES = frozenset(chr(cp) for cp in range(0x104C, 0x104F + 1))

# Myanmar special symbols: logographic particles plus punctuation.
# The result is a frozenset because the receiver, LOGOGRAPHIC_PARTICLES, is one.
MYANMAR_SPECIAL_SYMBOLS = LOGOGRAPHIC_PARTICLES.union(MYANMAR_PUNCTUATION)

Validation Sets

Valid Medial Sequences

# All valid medial combinations in canonical order: Ya > Ra > Wa > Ha
# Type: set of strings
# 15 entries in total (1 + 4 + 6 + 4): every non-empty selection of the four
# medials, each written in the canonical order above.
VALID_MEDIAL_SEQUENCES = {
    # Four-medial
    "ျြွှ",
    # Three-medial
    "ျြွ", "ျြှ", "ျွှ", "ြွှ",
    # Two-medial
    "ျြ", "ျွ", "ျှ", "ြွ", "ြှ", "ွှ",
    # Single medials
    "ျ", "ြ", "ွ", "ှ",
}

Normalization Order Weights

# UTN #11 canonical order for medials and vowels
# Type: dict (character -> numeric weight)
# Weights are grouped in bands: medials 10-13, vowels 20-22, Anusvara and
# tone marks 30-33, Virama 40. Entries are NOT listed in weight order: the
# fractional weights (21.4, 21.5) place Tall A / AA and Asat between the
# upper vowels (21) and the lower vowels (22).
ORDER_WEIGHTS = {
    "\u103b": 10,    # Medial Ya
    "\u103c": 11,    # Medial Ra
    "\u103d": 12,    # Medial Wa
    "\u103e": 13,    # Medial Ha
    "\u1031": 20,    # Vowel E
    "\u102d": 21,    # Vowel I (Upper)
    "\u102e": 21,    # Vowel II (Upper)
    "\u1032": 21,    # Vowel AI (Upper)
    "\u102f": 22,    # Vowel U (Lower)
    "\u1030": 22,    # Vowel UU (Lower)
    "\u102b": 21.4,  # Vowel A (Tall) -- after upper vowels, before Asat
    "\u102c": 21.4,  # Vowel AA -- same slot as Tall A
    "\u1036": 30,    # Anusvara
    "\u103a": 21.5,  # Asat -- after Tall A / AA, before lower vowels
    "\u1037": 32,    # Dot Below
    "\u1038": 33,    # Visarga
    "\u1039": 40,    # Virama
}

Zero-Width Characters

# Invisible characters stripped out during normalization
# Type: set
ZERO_WIDTH_CHARS = {
    chr(cp)
    for cp in (
        0x200B,  # ZERO WIDTH SPACE
        0x200C,  # ZERO WIDTH NON-JOINER
        0x200D,  # ZERO WIDTH JOINER
        0xFEFF,  # ZERO WIDTH NO-BREAK SPACE (BOM)
    )
}

Medial Compatibility Sets

Defines which consonants can validly combine with each medial.
# Consonants that can take Medial Ya-yit (ျ U+103B)
COMPATIBLE_YA: set     # Also aliased as COMPATIBLE_YA_YIT

# Consonants that can take Medial Ya-pin (ြ U+103C)
COMPATIBLE_RA: set     # Also aliased as COMPATIBLE_YA_PIN

# Consonants that can take Medial Wa (ွ U+103D)
COMPATIBLE_WA: set     # Broadest compatibility

# Consonants that can take Medial Ha (ှ U+103E)
COMPATIBLE_HA: set     # Sonorant consonants only

Phonetic Character Sets

# Sonorant consonants (nasals, liquids, glides)
SONORANTS: set

# Stop/obstruent consonants for syllable-final position
STOP_FINALS: set

Stacking and Kinzi

# Valid consonant stacking pairs for Pali/Sanskrit loanwords
# Type: set of (upper_consonant, lower_consonant) tuples
STACKING_EXCEPTIONS: set

# Kinzi valid followers (consonants that can follow Kinzi pattern)
# Type: set
KINZI_VALID_FOLLOWERS: set

Part-of-Speech Tag Constants

Granular particle tag constants defined in core_constants.py:
# Particle POS tags. Each granular tag's value is identical to its constant
# name; the general tag is the exception, with the bare value "P".
P_SUBJ = "P_SUBJ"      # Subject/topic marker
P_OBJ = "P_OBJ"        # Object marker
P_SENT = "P_SENT"      # Sentence ending particle
P_MOD = "P_MOD"        # Modifier particle
P_LOC = "P_LOC"        # Location/direction marker
P_GENERAL = "P"         # General particle tag (value is "P", not "P_GENERAL")

Skipped Context Words

High-frequency particles skipped by the Context Validator:
# Type: set
SKIPPED_CONTEXT_WORDS = {
    "ကွာ", "ဗျာ", "နော်", "ဟေ့", ...  # Interjections
    "က", "ကို", "သည်", "တယ်", ...      # Subject/object markers
    "မှာ", "မှ", "တွင်", ...              # Locative particles
    "နဲ့", "နှင့်", ...                    # Comitative/conjunctive
    "ရဲ့", "၏", ...                        # Genitive/possessive
    "များ", "လည်း", ...                   # Other common particles
    "လား", "လဲ",                          # Question particles
}

Morphology Constants

Suffix sets for OOV POS guessing (from core_constants.py):
# Type: frozenset[str]
VERB_SUFFIXES: frozenset[str]     # e.g., "ပြီ", "ပြီး", "ခဲ့", ...
NOUN_SUFFIXES: frozenset[str]     # e.g., "များ", "တွေ", "ခြင်း", ...
ADVERB_SUFFIXES: frozenset[str]   # e.g., "စွာ", "တိုင်း", ...

Enums

class ValidationLevel(str, Enum):
    """Validation depth selector; members are also plain ``str`` values."""
    SYLLABLE = "syllable"  # Fast, only checks valid syllables
    WORD = "word"          # Comprehensive, checks words and context

class ErrorType(str, Enum):
    SYLLABLE = "invalid_syllable"
    WORD = "invalid_word"
    CONTEXT_PROBABILITY = "context_probability"
    GRAMMAR = "grammar_error"
    # ... and more specific error types

Algorithm Defaults

class AlgorithmDefaults:
    MAX_EDIT_DISTANCE: int = 2
    PREFIX_LENGTH: int = 10
    COUNT_THRESHOLD: int = 1
    MAX_SUGGESTIONS: int = 5
    BEAM_WIDTH_DEFAULT: int = 50
    NGRAM_THRESHOLD: float = 0.01
    # ... see core_constants.py for full list

Usage in Code

Importing Constants

from myspellchecker.core.constants import (
    CONSONANTS,
    MEDIALS,
    VOWEL_SIGNS,
    INDEPENDENT_VOWELS,
    VOWEL_CARRIER,
    ORDER_WEIGHTS,
    ZERO_WIDTH_CHARS,
    MYANMAR_RANGE,
    MYANMAR_NUMERALS,
    MYANMAR_PUNCTUATION,
    TONE_MARKS,
    VALID_MEDIAL_SEQUENCES,
    SKIPPED_CONTEXT_WORDS,
    ValidationLevel,
    ErrorType,
)

Character Classification

def classify_myanmar_char(char: str) -> str:
    """Return the category label for a Myanmar character.

    The character is tested against each constant set in a fixed priority
    order, and the label of the first set containing it is returned.

    Args:
        char: The character to classify.

    Returns:
        One of 'consonant', 'medial', 'vowel_sign', 'independent_vowel',
        'tone_mark', 'numeral', 'punctuation', or 'unknown' when no set
        contains ``char``.
    """
    # Order matters: the first matching set wins.
    categories = (
        (CONSONANTS, 'consonant'),
        (MEDIALS, 'medial'),
        (VOWEL_SIGNS, 'vowel_sign'),
        (INDEPENDENT_VOWELS, 'independent_vowel'),
        (TONE_MARKS, 'tone_mark'),
        (MYANMAR_NUMERALS, 'numeral'),
        (MYANMAR_PUNCTUATION, 'punctuation'),
    )
    for members, label in categories:
        if char in members:
            return label
    return 'unknown'

Validation Helper

def validate_syllable_structure(syllable: str) -> tuple[bool, str]:
    """Validate the onset and medial cluster of a Myanmar syllable.

    Checks, in order:
      1. the syllable is non-empty;
      2. it begins with a consonant or the vowel carrier (U+1021), which
         behaves like a consonant in syllable structure;
      3. its medials are in canonical order (by ORDER_WEIGHTS), contain no
         repeats, and together form one of VALID_MEDIAL_SEQUENCES.

    Vowels, tone marks and finals are not inspected. Syllables that begin
    with an independent vowel are still rejected by check 2.

    Args:
        syllable: A single syllable in Unicode storage order.

    Returns:
        ``(True, "Valid")`` when every check passes, otherwise
        ``(False, reason)`` with a short human-readable reason.
    """
    if not syllable:
        return False, "Empty syllable"

    # CONSONANTS spans U+1000..U+1020 plus Great Sa, so the vowel carrier
    # (U+1021) has to be admitted explicitly.
    onset = syllable[0]
    if onset not in CONSONANTS and onset != VOWEL_CARRIER:
        return False, "Must start with consonant"

    medials = [c for c in syllable if c in MEDIALS]
    if not medials:
        return True, "Valid"

    # Canonical order means non-decreasing ORDER_WEIGHTS.
    weights = [ORDER_WEIGHTS[m] for m in medials]
    if weights != sorted(weights):
        return False, "Invalid medial order"
    # Equal weights survive the sort check, so repeats are caught here.
    if len(medials) != len(set(medials)):
        return False, "Duplicate medials"

    # Final guard against the explicit whitelist of combinations.
    if ''.join(medials) not in VALID_MEDIAL_SEQUENCES:
        return False, "Invalid medial combination"

    return True, "Valid"

Canonical Character Ordering (UTN #11)

Correct canonical order for Myanmar syllable characters (Unicode storage order):
1. Consonant (required)
2. Virama (္) + stacked consonant (if stacking)
3. Medial YA (ျ) - slot 3
4. Medial RA (ြ) - slot 4
5. Medial WA (ွ) - slot 5
6. Medial HA (ှ) - slot 6
7. Vowel E (ေ) - visually left but stored here
8. Upper vowels (ိ, ီ, ဲ)
9. Tall A (ါ) or AA (ာ)
10. Asat (်) - when forming final consonant
11. Lower vowels (ု, ူ)
12. Anusvara (ံ)
13. Tone marks: Dot Below (့), Visarga (း)
14. Final Virama (္) - rare, for special stacking
Note on Asat position: The Asat (်) appears in position 10 when it forms a final consonant cluster (e.g., မြန်), interleaving with vowels. This is distinct from its use in Kinzi patterns where it appears earlier.

See Also