#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Intro
=====

Sandhi Analyzer for Sanskrit words

@author: Karthik Madathil (github: @kmadathil)

Usage
=====

Use the ``LexicalSandhiAnalyzer`` to split a sentence (wrapped in a
``SanskritObject``) and retrieve the top 10 splits:

.. code:: python

    >>> from __future__ import print_function
    >>> from sanskrit_parser.parser.sandhi_analyzer import LexicalSandhiAnalyzer
    >>> from sanskrit_parser.base.sanskrit_base import SanskritObject, SLP1
    >>> sentence = SanskritObject("astyuttarasyAMdishidevatAtmA")
    >>> analyzer = LexicalSandhiAnalyzer()
    >>> splits = analyzer.getSandhiSplits(sentence).findAllPaths(10)
    >>> for split in splits:
    ...    print(split)
    ...
    [u'asti', u'uttarasyAm', u'diSi', u'devatA', u'AtmA']
    [u'asti', u'uttarasyAm', u'diSi', u'devat', u'AtmA']
    [u'asti', u'uttarasyAm', u'diSi', u'devata', u'AtmA']
    [u'asti', u'uttara', u'syAm', u'diSi', u'devatA', u'AtmA']
    [u'asti', u'uttarasyAm', u'diSi', u'devatA', u'at', u'mA']
    [u'asti', u'uttarasyAm', u'diSi', u'de', u'vatA', u'AtmA']
    [u'asti', u'uttarasyAm', u'diSi', u'devata', u'at', u'mA']
    [u'asti', u'uttas', u'rasyAm', u'diSi', u'devat', u'AtmA']
    [u'asti', u'uttara', u'syAm', u'diSi', u'devat', u'AtmA']
    [u'asti', u'uttarasyAm', u'diSi', u'de', u'avatA', u'AtmA']

The sandhi analyzer can also be used to look up the morphological tags
for a given word form (note that the database stores words ending in
visarga with a final 's'):

.. code:: python

    >>> word = SanskritObject('hares')
    >>> tags = analyzer.getMorphologicalTags(word)
    >>> for tag in tags:
    ...    print(tag)
    ...
    ('hf#1', set(['cj', 'snd', 'prim', 'para', 'md', 'sys', 'prs', 'v', 'np', 'sg', 'op']))
    ('hari#1', set(['na', 'mas', 'sg', 'gen']))
    ('hari#1', set(['na', 'mas', 'abl', 'sg']))
    ('hari#1', set(['na', 'fem', 'sg', 'gen']))
    ('hari#1', set(['na', 'fem', 'abl', 'sg']))
    ('hari#2', set(['na', 'mas', 'sg', 'gen']))
    ('hari#2', set(['na', 'mas', 'abl', 'sg']))
    ('hari#2', set(['na', 'fem', 'sg', 'gen']))
    ('hari#2', set(['na', 'fem', 'abl', 'sg']))

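``hasTag`` can then be used to filter such lookups: it returns only the
(base, tagset) pairs whose base matches ``name`` and whose tags include
``tagset``, or ``None`` if nothing matches. A minimal sketch (the tag
strings follow the database conventions shown above; the exact return
value depends on the database contents):

.. code:: python

    >>> base = SanskritObject('hari#1')
    >>> analyzer.hasTag(word, base, set(['mas', 'sg', 'gen']))
    [('hari#1', set(['na', 'mas', 'sg', 'gen']))]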


"""

from __future__ import print_function
from sanskrit_parser.util.lexical_lookup_factory import LexicalLookupFactory
import sanskrit_parser.base.sanskrit_base as SanskritBase

from .sandhi import Sandhi
import logging
from .datastructures import SandhiGraph
from argparse import ArgumentParser


try:
    from functools import lru_cache
except ImportError:
    from backports.functools_lru_cache import lru_cache

logger = logging.getLogger(__name__)


class LexicalSandhiAnalyzer(object):
    """ Singleton class to hold methods for Sanskrit lexical sandhi analysis.

        We define lexical sandhi analysis to be the process of taking an input
        sequence and transforming it to a collection (represented by a DAG) of
        potential sandhi splits of the sequence. Each member of a split is
        guaranteed to be a valid lexical form.
    """

    sandhi = Sandhi()  # Singleton!

    def __init__(self, lexical_lookup="combined"):
        self.forms = LexicalLookupFactory.create(lexical_lookup)

    def getMorphologicalTags(self, obj, tmap=True):
        """ Get morphological tags for a word

            Params:
                obj (SanskritString): word
                tmap (Boolean): if True (default), map tags to our format
            Returns:
                list: list of (base, tagset) pairs
        """
        ot = obj.canonical()
        tags = self.forms.get_tags(ot, tmap)
        return tags

    def hasTag(self, obj, name, tagset):
        """ Check if a word matches the given morphological tags

            Params:
                obj (SanskritString): word
                name (str): name in tag
                tagset (set): set of tag elements
            Returns:
                list: list of (base, tagset) pairs for obj that match
                      (name, tagset), or None
        """
        morphological_tags = self.getMorphologicalTags(obj)
        if morphological_tags is None:
            return None
        assert (name is not None) or (tagset is not None)
        r = []
        for li in morphological_tags:
            # Keep entries where name is None or matches the base, and
            # tagset is None or all its elements are found in the entry's tags
            if ((name is None) or name.canonical() == li[0]) and \
               ((tagset is None) or tagset.issubset(li[1])):
                r.append(li)
        if r == []:
            return None
        else:
            return r

    def tagSandhiGraph(self, g):
        ''' Tag a sandhi graph with morphological tags for each node

            Params:
                g (SandhiGraph): input lexical sandhi graph
        '''
        for n in g:
            # Skip the start and end markers, which are not SanskritObjects
            if isinstance(n, SanskritBase.SanskritObject):
                t = self.getMorphologicalTags(n)
                logger.debug("Got tags %s for %s", t, n)
                n.setMorphologicalTags(t)

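    # Usage sketch: tagSandhiGraph can also be applied after the fact to a
    # graph built without tagging (hypothetical variable names):
    #
    #   graph = analyzer.getSandhiSplits(sentence)  # tag defaults to False
    #   if graph is not None:
    #       analyzer.tagSandhiGraph(graph)          # annotates nodes in place
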
    def preSegmented(self, sl, tag=False):
        ''' Get a SandhiGraph for a pre-segmented sentence

            Params:
                sl (list of SanskritString): input words
                tag (Boolean): when True (default False), return a
                               morphologically tagged graph
            Returns:
                SandhiGraph: DAG of all possible splits
        '''
        self.sentence = SandhiGraph()
        prev = None
        # Walk the words in reverse, linking each word to its successor
        for s in sl[::-1]:
            self.sentence.add_node(s)
            if prev is None:
                # The last word of the sentence connects to the end marker
                self.sentence.add_end_edge(s)
            else:
                self.sentence.append_to_node(s, [prev])
            prev = s
        # The first word is the single root of the graph
        self.sentence.add_roots([prev])
        if tag:
            self.tagSandhiGraph(self.sentence)
        self.sentence.lock_start()
        return self.sentence

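    # Usage sketch (hypothetical input): build a graph for words that are
    # already separated, skipping sandhi splitting entirely:
    #
    #   words = [SanskritBase.SanskritObject(w, encoding=SanskritBase.SLP1)
    #            for w in ["asti", "uttarasyAm", "diSi"]]
    #   g = analyzer.preSegmented(words, tag=True)
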
    def getSandhiSplits(self, o, tag=False, pre_segmented=False):
        ''' Get all valid sandhi splits for a string

            Params:
                o (SanskritString): input object
                tag (Boolean): when True (default False), return a
                               morphologically tagged graph
            Returns:
                SandhiGraph: DAG of all possible splits, or None if no
                             valid split exists
        '''
        if pre_segmented:
            return self.preSegmented(o, tag)
        self.dynamic_scoreboard = {}
        # Transform to internal canonical form
        s = o.canonical()
        # Initialize an empty graph to hold the splits
        self.splits = SandhiGraph()
        # _possible_splits updates the graph in self.splits with nodes
        # and returns the roots
        roots = self._possible_splits(s)
        if tag and len(roots) > 0:
            self.tagSandhiGraph(self.splits)
        if len(roots) == 0:
            return None
        else:
            self.splits.add_roots(roots)
            return self.splits

    def _possible_splits(self, s):
        ''' Private method to dynamically compute all sandhi splits

            Used by getSandhiSplits. Adds the individual splits to the graph
            self.splits and returns the roots of the subgraph corresponding
            to the split of s.

            Params:
                s (string): input SLP1 encoded string
            Returns:
                roots: set of roots of the subgraph corresponding to
                       possible splits of s
        '''
        logger.debug("Splitting " + s)

        @lru_cache(256)
        def _is_valid_word(ss):
            r = self.forms.valid(ss)
            return r

        def _sandhi_splits_all(s, start=None, stop=None):
            obj = SanskritBase.SanskritImmutableString(s,
                                                       encoding=SanskritBase.SLP1)
            splits = self.sandhi.split_all(obj, start, stop)
            return splits

        roots = set()

        # Memoization for dynamic programming - remember substrings that
        # have been seen before
        if s in self.dynamic_scoreboard:
            logger.debug("Found {} in scoreboard".format(s))
            return self.dynamic_scoreboard[s]

        # If a space is found in the string, stop splitting at that space
        spos = s.find(" ")
        stop = None if spos == -1 else spos
        s_c_list = _sandhi_splits_all(s, start=0, stop=stop)
        logger.debug("s_c_list: " + str(s_c_list))
        if s_c_list is None:
            s_c_list = []

        node_cache = {}

        for (s_c_left, s_c_right) in s_c_list:
            # Is the left side a valid word?
            if _is_valid_word(s_c_left):
                logger.debug("Valid left word: " + s_c_left)
                # For each split with a valid left part, check if there are
                # valid splits of the right part
                if s_c_right and s_c_right != '':
                    logger.debug("Trying to split:" + s_c_right)
                    r_roots = self._possible_splits(s_c_right.strip())
                    # If there are valid splits of the right side
                    if r_roots:
                        # Make sure we got a set of roots back
                        assert isinstance(r_roots, set)
                        if s_c_left not in node_cache:
                            # Extend the splits graph with s_c_left linked to
                            # the possible splits of s_c_right
                            t = SanskritBase.SanskritObject(s_c_left,
                                                            encoding=SanskritBase.SLP1)
                            node_cache[s_c_left] = t
                        else:
                            t = node_cache[s_c_left]
                        roots.add(t)
                        if not self.splits.has_node(t):
                            self.splits.add_node(t)
                        self.splits.append_to_node(t, r_roots)
                else:
                    # Null right part
                    # Why cache s_c_left here? To handle the case where the
                    # same s_c_left appears with both a null and a non-null
                    # right side.
                    if s_c_left not in node_cache:
                        t = SanskritBase.SanskritObject(s_c_left,
                                                        encoding=SanskritBase.SLP1)
                        node_cache[s_c_left] = t
                    else:
                        t = node_cache[s_c_left]
                    # s_c_left is itself a complete word here, so it gets an
                    # edge to the end marker
                    roots.add(t)
                    if not self.splits.has_node(t):
                        self.splits.add_node(t)
                    self.splits.add_end_edge(t)
            else:
                logger.debug("Invalid left word: " + s_c_left)

        # Update the scoreboard for this substring, so we don't have to
        # split it again
        self.dynamic_scoreboard[s] = roots
        if len(roots) == 0:
            logger.debug("No splits found, returning empty set")
        else:
            logger.debug("Roots: %s", roots)
        return roots

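    # Illustration of the recursion above (hypothetical trace): for
    # s = "astyuttarasyAm", one candidate sandhi split is
    # ("asti", "uttarasyAm"); "asti" is a valid form, so
    # _possible_splits("uttarasyAm") is computed (and memoized in
    # self.dynamic_scoreboard), and its roots become children of the
    # "asti" node in self.splits.
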
def getArgs(argv=None):
    """ Argparse routine. Returns the parsed args. """
    # Parser setup
    parser = ArgumentParser(description='Lexical Sandhi Analyzer')
    # Input string to split (or to look up, with --tags)
    parser.add_argument('data', nargs="?", type=str, default="adhi")
    # Input encoding (autodetect by default)
    parser.add_argument('--input-encoding', type=str, default=None)
    # Filter by base name
    parser.add_argument('--base', type=str, default=None)
    # Filter by tag set
    parser.add_argument('--tag-set', type=str, default=None, nargs="+")
    # Look up tags instead of splitting
    parser.add_argument('--tags', dest='split', action='store_false')
    parser.add_argument('--max-paths', type=int, default=10)
    parser.add_argument('--lexical-lookup', type=str, default="combined")
    parser.add_argument('--strict-io', action='store_true',
                        help="Do not modify the input/output string to match conventions",
                        default=False)
    parser.add_argument('--no-score', dest="score", action='store_false',
                        help="Don't use the lexical scorer to score the splits and reorder them")
    parser.add_argument('--no-map-tags', dest='map_tags', action='store_false',
                        help="Show raw (unmapped to our standard set) tags")
    return parser.parse_args(argv)

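# Example invocations, assuming this file is run directly as a script
# (flags as defined in getArgs above):
#
#   python sandhi_analyzer.py astyuttarasyAMdiSi --max-paths 5
#   python sandhi_analyzer.py hares --tags
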
def main(argv=None):
    graph = None
    args = getArgs(argv)
    if args.strict_io:
        print("Interpreting input strictly")
    else:
        print("Interpreting input loosely (strict_io set to false)")
    print("Input String:", args.data)
    s = LexicalSandhiAnalyzer(args.lexical_lookup)
    if args.input_encoding is None:
        ie = None
    else:
        ie = SanskritBase.SCHEMES[args.input_encoding]
    with SanskritBase.outputctx(args.strict_io):
        if not args.split:
            # Tag lookup mode (--tags)
            i = SanskritBase.SanskritNormalizedString(args.data, encoding=ie,
                                                      strict_io=args.strict_io,
                                                      replace_ending_visarga='s')
            print("Input String in SLP1:", i.canonical())
            ts = s.getMorphologicalTags(i, tmap=args.map_tags)
            print("Morphological tags:")
            if ts is not None:
                for t in ts:
                    print(t)
            # Possible rakaranta
            # Try by replacing the ending visarga with 'r' instead
            elif not args.strict_io:
                i = SanskritBase.SanskritNormalizedString(args.data, encoding=ie,
                                                          strict_io=args.strict_io,
                                                          replace_ending_visarga='r')
                ts = s.getMorphologicalTags(i)
                if ts is not None:
                    print("Input String in SLP1:", i.canonical())
                    for t in ts:
                        print(t)
            if args.tag_set or args.base:
                if args.tag_set is not None:
                    g = set(args.tag_set)
                else:
                    g = None
                if args.base is not None:
                    b = SanskritBase.SanskritNormalizedString(args.base)
                else:
                    b = None
                print(s.hasTag(i, b, g))
        else:
            # Split mode (default)
            import time
            i = SanskritBase.SanskritNormalizedString(args.data, encoding=ie,
                                                      strict_io=args.strict_io,
                                                      replace_ending_visarga=None)
            print("Input String in SLP1:", i.canonical())
            print("Start Split")
            start_split = time.time()
            graph = s.getSandhiSplits(i)
            end_graph = time.time()
            print("End DAG generation")
            if graph:
                logger.debug("Graph has %d nodes and %d edges",
                             len(graph.G.nodes()), len(graph.G.edges()))
                splits = graph.find_all_paths(max_paths=args.max_paths,
                                              score=args.score)
                print("End pathfinding", time.time())
                print("Splits:")
                if splits:
                    for split in splits:
                        print(split)
                else:
                    print("None")
            else:
                print("No Valid Splits Found")
            end_split = time.time()
            print("-----------")
            print("Performance")
            print("Time for graph generation = {0:0.6f}s".format(end_graph - start_split))
            print("Total time for graph generation + find paths = {0:0.6f}s".format(end_split - start_split))
    return graph

if __name__ == "__main__":
    main()