# -*- coding: utf-8 -*-
"""
Code Usage
----------
The ``Parser`` class can be used to generate vakya parses thus:
.. code-block:: python
from sanskrit_parser import Parser
string = "astyuttarasyAMdiSi"
input_encoding = "SLP1"
output_encoding = "SLP1"
parser = Parser(input_encoding=input_encoding,
output_encoding=output_encoding,
replace_ending_visarga='s')
print('Splits:')
for split in parser.split(string, limit=10):
print(f'Lexical Split: {split}')
for i, parse in enumerate(split.parse(limit=2)):
print(f'Parse {i}')
print(f'{parse}')
break
This produces the output::
Lexical Split: ['asti', 'uttarasyAm', 'diSi']
Parse 0
asti => (asti, ['samAsapUrvapadanAmapadam', 'strIliNgam']) : samasta of uttarasyAm
uttarasyAm => (uttara#1, ['saptamIviBaktiH', 'strIliNgam', 'ekavacanam'])
diSi => (diS, ['saptamIviBaktiH', 'ekavacanam', 'strIliNgam']) : viSezaRa of uttarasyAm
Parse 1
asti => (asti, ['samAsapUrvapadanAmapadam', 'strIliNgam']) : samasta of uttarasyAm
uttarasyAm => (uttara#2, ['saptamIviBaktiH', 'strIliNgam', 'ekavacanam']) : viSezaRa of diSi
diSi => (diS#2, ['saptamIviBaktiH', 'strIliNgam', 'ekavacanam'])
Parse 2
asti => (as#1, ['kartari', 'praTamapuruzaH', 'law', 'parasmEpadam', 'ekavacanam', 'prATamikaH'])
uttarasyAm => (uttara#2, ['saptamIviBaktiH', 'strIliNgam', 'ekavacanam']) : viSezaRa of diSi
diSi => (diS, ['saptamIviBaktiH', 'ekavacanam', 'strIliNgam']) : aDikaraRam of asti
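
``Split`` and ``Parse`` objects are ``Serializable`` and can be dumped to JSON
with this module's ``JSONEncoder``. A minimal sketch (assuming this module is
importable as ``sanskrit_parser.api``):

.. code-block:: python

    import json
    from sanskrit_parser import Parser
    from sanskrit_parser.api import JSONEncoder

    parser = Parser(output_encoding='SLP1', replace_ending_visarga='s')
    splits = parser.split("astyuttarasyAMdiSi", limit=1)
    print(json.dumps(splits[0], ensure_ascii=False, indent=2, cls=JSONEncoder))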
"""
import time
import json
import abc
import warnings
from dataclasses import dataclass
from typing import Sequence
from sanskrit_parser.base.sanskrit_base import SCHEMES, SanskritObject, SLP1
from sanskrit_parser.base.sanskrit_base import SanskritNormalizedString, SanskritString
from sanskrit_parser.parser.sandhi_analyzer import LexicalSandhiAnalyzer
from sanskrit_parser.parser.datastructures import VakyaGraph, VakyaGraphNode
import logging
logger = logging.getLogger(__name__)


class Serializable(abc.ABC):
    """ Base class to indicate an object is serializable into JSON """

    @abc.abstractmethod
    def serializable(self):
        ''' Return an object that can be serialized by json.JSONEncoder '''
        pass


class JSONEncoder(json.JSONEncoder):
    def default(self, o):
        if isinstance(o, Serializable):
            return o.serializable()
        return super().default(o)
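
# Usage sketch: any Serializable object (e.g. a Split or Parse below) can be
# serialized directly with this encoder, as in the __main__ block at the end:
#   json.dumps(split, ensure_ascii=False, indent=2, cls=JSONEncoder)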


class Parser():
    def __init__(self, strict_io: bool = False, input_encoding: str = None,
                 output_encoding: str = 'SLP1', lexical_lookup: str = "combined",
                 score: bool = True, split_above: int = 5,
                 replace_ending_visarga: str = None, fast_merge: bool = True):
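        # Parameter notes (inferred from usage in this module; not an
        # authoritative reference):
        #   strict_io              - pass input/output through without normalization
        #   input_encoding         - a key of SCHEMES; auto-detected when None
        #   output_encoding        - encoding used when transcoding output
        #   lexical_lookup         - lexical lookup backend for sandhi analysis
        #   score                  - score and sort splits
        #   split_above            - passed to VakyaGraph as max_parse_dc
        #   replace_ending_visarga - rewrite a final visarga as 's' or 'r'
        #   fast_merge             - passed through to VakyaGraph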
        self.strict_io = strict_io
        if input_encoding is not None:
            self.input_encoding = SCHEMES[input_encoding]
        else:
            self.input_encoding = None
        self.output_encoding = SCHEMES[output_encoding]
        self.lexical_lookup = lexical_lookup
        self.score = score
        self.split_above = split_above
        self.replace_ending_visarga = replace_ending_visarga
        self.fast_merge = fast_merge
        self.sandhi_analyzer = LexicalSandhiAnalyzer(self.lexical_lookup)
    def _maybe_pre_segment(self, input_string: str, pre_segmented: bool):
        ''' Pre-process pre-segmented input if necessary '''
        if not pre_segmented:
            s = SanskritNormalizedString(input_string,
                                         encoding=self.input_encoding,
                                         strict_io=self.strict_io,
                                         replace_ending_visarga=self.replace_ending_visarga)
            logger.info(f"Input String in SLP1: {s.canonical()}")
            return s
        else:
            logger.debug("Pre-Segmented")
            s = []
            for seg in input_string.split(" "):
                o = SanskritObject(seg,
                                   encoding=self.input_encoding,
                                   strict_io=self.strict_io,
                                   replace_ending_visarga='r')
                ts = self.sandhi_analyzer.getMorphologicalTags(o, tmap=True)
                if ts is None:
                    # Possible sakaranta
                    # Try by replacing end visarga with 's' instead
                    o = SanskritObject(seg,
                                       encoding=self.input_encoding,
                                       strict_io=self.strict_io,
                                       replace_ending_visarga='s')
                    ts = self.sandhi_analyzer.getMorphologicalTags(o, tmap=True)
                if ts is None:
                    logger.warning(f"Unknown pada {seg} - will be split")
                    # self.split returns None if no split is found
                    splits = self.split(seg, pre_segmented=False, limit=1)
                    if splits is None:
                        logger.warning(f"Unknown pada {seg} - cannot be split")
                    else:
                        logger.info(f"Split {splits[0]}")
                        s.extend(splits[0].split)
                else:
                    s.append(o)
            logger.info(f"Input String in SLP1: {' '.join([x.canonical() for x in s])}")
            return s

    def split(self,
              input_string: str,
              limit: int = 10,
              pre_segmented: bool = False,
              dot_file=None,
              ):
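        ''' Split input_string into lexically valid padas.

        Notes (inferred from this module; not an authoritative reference):
          input_string  - sandhied text, or space-separated padas when
                          pre_segmented is True
          limit         - maximum number of splits to return
          pre_segmented - treat the input as already split into padas
          dot_file      - optional path; the sandhi graph is written there
                          in GraphViz dot format

        Returns a list of Split objects, or None if no split is found.
        '''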
        s = self._maybe_pre_segment(input_string, pre_segmented)
        logger.debug("Start Split")
        graph = self.sandhi_analyzer.getSandhiSplits(s, tag=True,
                                                     pre_segmented=pre_segmented)
        logger.debug("End DAG generation")
        if graph is None:
            warnings.warn("No splits found. Please check the input to ensure there are no typos.")
            return None
        else:
            if dot_file is not None:
                graph.write_dot(dot_file)
            splits = graph.find_all_paths(max_paths=limit,
                                          sort=True,
                                          score=self.score)
            return [Split(self, input_string, split)
                    for split in splits]


@dataclass
class Split(Serializable):
    parser: Parser
    input_string: str
    split: Sequence[SanskritObject]
    vgraph: VakyaGraph = None

    def __repr__(self):
        return f'Split({self.input_string}) = {self.split}'

    def __str__(self):
        strict_io = self.parser.strict_io
        encoding = self.parser.output_encoding
        out = [t.transcoded(encoding, strict_io) for t in self.split]
        return str(out)

    def parse(self, limit=10, min_cost_only=False):
        self.vgraph = VakyaGraph(self.split,
                                 fast_merge=self.parser.fast_merge,
                                 max_parse_dc=self.parser.split_above)
        parses = self.vgraph.parses[:limit]
        costs = self.vgraph.parse_costs[:limit]
        min_cost = min(costs) if len(costs) else 0
        if min_cost_only:
            parses = [x for x, cost in zip(parses, costs) if cost == min_cost]
        return [Parse(self, parse, cost) for parse, cost in zip(parses, costs)]
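
    # Note (inferred): write_dot below relies on self.vgraph, which is only
    # populated by parse(); call parse() before write_dot.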
    def write_dot(self, basepath):
        self.vgraph.write_dot(basepath)

    def serializable(self):
        strict_io = self.parser.strict_io
        encoding = self.parser.output_encoding
        out = [t.transcoded(encoding, strict_io) for t in self.split]
        return {'split': out,
                'parses': list(self.parse())
                }


@dataclass
class ParseTag(Serializable):
    root: str
    tags: Sequence[str]

    def serializable(self):
        return {'root': self.root, 'tags': self.tags}

    def __str__(self):
        return f'({self.root}, {self.tags})'


class ParseNode(Serializable):
    def __init__(self, node: VakyaGraphNode,
                 strict_io: bool,
                 encoding: str):
        self.pada = node.pada.transcoded(encoding, strict_io)
        tag = node.getMorphologicalTags()
        self.parse_tag = ParseTag(tag[0].transcoded(encoding, strict_io),
                                  [t.transcoded(encoding, strict_io) for t in tag[1]])
        self.index = node.index

    def __str__(self):
        return f'{self.pada} => {self.parse_tag}'

    def serializable(self):
        d = {'pada': self.pada}
        d.update(self.parse_tag.serializable())
        return d


@dataclass
class ParseEdge(Serializable):
    predecessor: ParseNode
    node: ParseNode
    label: str

    def __str__(self):
        return f'{self.node} : {self.label} of {self.predecessor.pada}'

    def serializable(self):
        return {'node': self.node,
                'predecessor': self.predecessor,
                'sambandha': self.label
                }
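
# Note: these __str__ forms produce the lines in the module docstring output;
# e.g. "diSi => (diS, [...]) : aDikaraRam of asti" is a ParseEdge rendered as
# '<node> : <label> of <predecessor>'.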


class Parse(Serializable):
    def __init__(self, split: Split, parse_graph, cost):
        strict_io = split.parser.strict_io
        encoding = split.parser.output_encoding
        graph = []
        for n in sorted(list(parse_graph), key=lambda x: x.index):
            node = ParseNode(n, strict_io, encoding)
            preds = list(parse_graph.predecessors(n))
            if preds:
                pred = preds[0]  # Only one predecessor in a parse
                # parse_graph is a multigraph; take the label of the first
                # (and only) edge between pred and n
                lbl = list(parse_graph[pred][n].values())[0]['label']
                pred_node = ParseNode(pred, strict_io, encoding)
                edge = ParseEdge(pred_node,
                                 node,
                                 SanskritString(lbl, encoding=SLP1).transcoded(encoding, strict_io))
                graph.append(edge)
            else:
                graph.append(node)
        self.graph = graph
        self.cost = cost
        self.parse_graph = parse_graph

    def __str__(self):
        return '\n'.join([str(t) for t in self.graph])

    def __iter__(self):
        return iter(self.graph)

    def to_conll(self):
        r = []
        for t in self.graph:
            if isinstance(t, ParseNode):
                r.append([str(t.index + 1), str(t.pada), "_",
                          str(t.parse_tag.root), str(t.parse_tag.tags),
                          "0", "root"])
            else:
                r.append([str(t.node.index + 1), str(t.node.pada), "_",
                          str(t.node.parse_tag.root), str(t.node.parse_tag.tags),
                          str(t.predecessor.index + 1), str(t.label)])
        return r
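
    # Sketch: to_conll rows can be flattened to CoNLL-style text, e.g.
    #   '\n'.join('\t'.join(row) for row in parse.to_conll())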

    def to_dot(self):
        from io import StringIO
        import networkx as nx
        s = StringIO()
        nx.drawing.nx_pydot.write_dot(self.parse_graph, s)
        return s.getvalue()
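
    # The returned string is GraphViz dot source; it can be rendered outside
    # Python, e.g. `dot -Tpng parse.dot -o parse.png` (filename illustrative).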

    def serializable(self):
        return {'graph': self.graph}
if __name__ == "__main__":
start_time = time.time()
def api_example(string, output_encoding):
parser = Parser(output_encoding=output_encoding,
replace_ending_visarga='s')
print('Splits:')
for split in parser.split(string, limit=10):
print(f'Lexical Split: {split}')
for i, parse in enumerate(split.parse(limit=2)):
print(f'Parse {i}')
print(f'{parse}')
break
print(json.dumps(split,
ensure_ascii=False,
indent=2,
cls=JSONEncoder)
)
def main():
examples = [('devadattogrAmaMgacCati', 'SLP1'),
('astyuttarasyAMdishidevatAtmA', 'Devanagari')
]
for string, encoding in examples:
api_example(string, encoding)
main()
print(f'Took {time.time() - start_time} s')