#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
""" Sanskrit Parser Data Structures
@author: Karthik Madathil (github: @kmadathil)
"""
from sanskrit_parser.base.sanskrit_base import SanskritObject, SanskritImmutableString, SLP1
import networkx as nx
from itertools import islice, product
import logging
import operator
from copy import copy
import six
import time
from collections import defaultdict
from os.path import dirname, basename, splitext, join
from sanskrit_parser.util import lexical_scorer
from sanskrit_parser.util.disjoint_set import DisjointSet
from sanskrit_parser.util.DhatuWrapper import DhatuWrapper
__all__ = ['SandhiGraph', 'VakyaGraph', 'VakyaParse', 'getSLP1Tagset']
dw = DhatuWrapper()
logger = logging.getLogger(__name__)
class SandhiGraph(object):
    """ DAG class to hold Sandhi Lexical Analysis Results
    Represents the results of lexical sandhi analysis as a DAG
    Nodes are SanskritObjects
    """
    # Sentinel node names for the single entry/exit points of the DAG
    start = "__start__"
    end = "__end__"

    def __init__(self):
        ''' DAG Class Init
        Params:
            elem (SanskritObject :optional:): Optional initial element
            end (bool :optional:): Add end edge to initial element
        '''
        # NOTE(review): the docstring mentions elem/end parameters, but
        # __init__ takes none — looks like a stale docstring; confirm
        self.roots = []
        self.G = nx.DiGraph()
        # Scorer used to weight edges / rank extracted paths
        self.scorer = lexical_scorer.Scorer()

    def __iter__(self):
        ''' Iterate over nodes '''
        return self.G.__iter__()
[docs] def has_node(self, t):
''' Does a given node exist in the graph?
Params:
t (SanskritObject): Node
Returns:
boolean
'''
return t in self.G
[docs] def append_to_node(self, t, nodes):
""" Create edges from t to nodes
Params:
t (SanskritObject) : Node to append to
nodes (iterator(nodes)) : Nodes to append to t
"""
# t is in our graph
assert t in self.G
for r in nodes:
self.G.add_edge(t, r)
[docs] def add_node(self, node):
""" Extend dag with node inserted at root
Params:
Node (SanskritObject) : Node to add
root (Boolean) : Make a root node
end (Boolean) : Add an edge to end
"""
assert node not in self.G
self.G.add_node(node)
[docs] def add_end_edge(self, node):
''' Add an edge from node to end '''
assert node in self.G
self.G.add_edge(node, self.end)
[docs] def add_roots(self, roots):
self.roots.extend(roots)
[docs] def lock_start(self):
''' Make the graph ready for search by adding a start node
Add a start node, add arcs to all current root nodes, and clear
self.roots
'''
self.G.add_node(self.start)
for r in self.roots:
self.G.add_edge(self.start, r)
self.roots = []
def score_graph(self):
    """Assign a ``weight`` attribute to every edge from split scores.

    Edges touching the start/end sentinels are scored on the real node
    alone; interior edges are scored as a pair.
    """
    edge_view = self.G.edges()
    all_edges = []
    splits = []
    for (u, v) in edge_view:
        all_edges.append((u, v))
        if u == self.start:
            splits.append([v])
        elif v == self.end:
            splits.append([u])
        else:
            splits.append((u, v))
    scores = self.scorer.score_splits(splits)
    for e, sc in zip(all_edges, scores):
        # Score is log-likelihood, so higher is better.
        # For graph path-finding, smaller weight is better, so use negative
        edge_view[e]['weight'] = -sc
    for u, v, w in self.G.edges(data='weight'):
        logger.debug("u = %s, v = %s, w = %s", u, v, w)
def find_all_paths(self, max_paths=10, sort=True, score=True):
    """ Find all paths through DAG to End

    Params:
        max_paths (int :default:=10): Number of paths to find
                  If this is > 1000, all paths will be found
        sort (bool) : If True (default), sort paths
               in ascending order of length
        score (bool): If True (default), weight edges, rank the found
               paths by total split score (best first)
    """
    if self.roots:
        self.lock_start()
    if score:
        self.score_graph()
    # shortest_simple_paths is slow for >1000 paths
    if max_paths <= 1000:
        if score:
            # Paths come out in increasing weight order; strip the
            # start/end sentinels from each.
            paths = [p[1:-1] for p in
                     islice(nx.shortest_simple_paths(self.G, self.start,
                                                     self.end,
                                                     weight='weight'),
                            max_paths)]
            scores = self.scorer.score_splits(paths)
            sorted_path_scores = sorted(zip(paths, scores),
                                        key=operator.itemgetter(1),
                                        reverse=True)
            logger.debug("Sorted paths with scores:\n %s", sorted_path_scores)
            # Strip the scores from the returned result, to be consistent with no-scoring option
            sorted_paths, _ = zip(*sorted_path_scores)
            return list(sorted_paths)
        else:
            return [p[1:-1] for p in
                    islice(nx.shortest_simple_paths(self.G, self.start,
                                                    self.end),
                           max_paths)]
    else:  # Fall back to all_simple_paths
        ps = [p[1:-1] for p in
              nx.all_simple_paths(self.G, self.start, self.end)]
        # If we do not intend to display paths, no need to sort them
        if sort:
            ps.sort(key=len)
        return ps
def __str__(self):
""" Print representation of DAG """
return str(self.G)
def draw(self, *args, **kwargs):
    """Draw the graph with matplotlib, de-duplicating node labels."""
    seen = {}

    def _uniq(s):
        # First occurrence keeps its name; later ones get _1, _2, ...
        if s in seen:
            seen[s] += 1
            return s + "_" + str(seen[s])
        seen[s] = 0
        return s

    nx.draw(self.G, *args, **kwargs,
            pos=nx.spring_layout(self.G),
            labels={x: _uniq(str(x)) for x in self})
def write_dot(self, path):
    """Write the graph to ``path`` in Graphviz dot format."""
    nx.drawing.nx_pydot.write_dot(self.G, path)
# --- Morphological tag sets used to classify graph nodes ---
# Finite-verb (ti~Nanta) lakaras
lakaras = set(['law', 'liw', 'luw', 'lrw', 'low', 'laN', 'liN', 'luN', 'lfN',
               'viDiliN', 'ASIrliN'])
# Krt (participial/infinitive/absolutive) verb forms
krtverbs = set(['ktvA', 'Satf', 'SAnac', 'tumun', 'kta', 'ktavatu', 'lyap'])
purvakala = set(['ktvA', 'lyap'])
# FIX: 'SAnac' was misspelled 'Sanac' here (cf. the spelling in krtverbs
# above), so SAnac participles could never match samanakala
samanakala = set(['Satf', 'SAnac'])
nishta = set(['kta', 'ktavatu'])
karmani = set(['karmaRi'])
samastas = set(['samAsapUrvapadanAmapadam'])
nijanta = set(['RijantaH'])
# Vibhaktis (ordered 1..8; the named aliases below use 1-based indexing)
_vibhaktis = ['praTamAviBaktiH', 'dvitIyAviBaktiH', 'tftIyAviBaktiH',
              'caturTIviBaktiH', 'paYcamIviBaktiH', 'zazWIviBaktiH',
              'saptamIviBaktiH', 'saMboDanaviBaktiH']
prathama = _vibhaktis[1-1]
dvitiya = _vibhaktis[2-1]
tritiya = _vibhaktis[3-1]
chaturthi = _vibhaktis[4-1]
pancami = _vibhaktis[5-1]
shashthi = _vibhaktis[6-1]
saptami = _vibhaktis[7-1]
sambodhana = _vibhaktis[8-1]
vibhaktis = set(_vibhaktis)
# Vacanas
vacanas = set(['ekavacanam', 'dvivacanam', 'bahuvacanam'])
# Puruzas
puruzas = 'praTamapuruzaH', 'maDyamapuruzaH', 'uttamapuruzaH'
karakas = set(['kartA', 'karma', 'karaRam', 'apAdAnam', 'sampradAnam', 'aDikaraRam', 'hetu-kartA'])
predicative_verbs = set(['as', 'BU', 'vft'])
_lingas = ['puMlliNgam', 'napuMsakaliNgam', 'strIliNgam', 'triliNgam']
lingas = set(_lingas)
napumsakam = _lingas[1]
avyaya = set(['avyayam'])
kriyavisheshana = set(['kriyAviSezaRam'])
nishedha = set(['na'])
# FIXME - api and su (pUjAyaM) in here temporarily - need a better solution
karmap_null = set(['su', 'api'])
avyaya_kriyav = set(['kila', 'bata', 'aho', 'nanu', 'hanta', 'eva', 'tu'])
projlabels = karakas.union(kriyavisheshana)
# sambaddha links are projective
samplabels = {'sambadDa-'+lbl for lbl in projlabels}.union({'saMbadDakriyA'})
projlabels.update(samplabels)
# ya-/ta- correlative pairs; None means the "t" word may be absent
sentence_conjunctions = {"yad": {"tad", None},
                         "yadi": {"tarhi"},
                         "yatra": {"tatra"},
                         "yAvat": {"tAvat"},
                         "yadA": {"tadA"},
                         "yaTA": {"taTA"},
                         "api": {None},
                         "cet": {None},
                         "natu": {None},
                         }
conjunctions = {"ca"}
disjunctions = {"uta", "vA"}
# Non Karaka Vibhakti: upapada words that govern a particular vibhakti
non_karaka_vibhaktis = {
    2: ["antarA", "antareRa", "pfTak", "vinA", "nAnA"],
    3: ["saha", "pfTak", "vinA", "nAnA"],
    4: ["namaH", "svasti", "svAhA", "alam", "vazaw"],
    5: ["anya", "Arat", "itara", "fte", "pfTak", "vinA", "nAnA"],
}
# FIXME Merge these with above
karmap_2 = set(['anu', 'upa', 'prati', 'aBi', 'aDi', 'ati'])
karmap_5 = set(['apa', 'pari', 'A', 'prati'])
# Edge costs used for ordering (lower = preferred); unknown labels cost 1
edge_cost = defaultdict(lambda: 1)
for k in karakas:
    edge_cost[k] = 0.9
edge_cost['karma'] = 0.85
edge_cost['kartA'] = 0.8
edge_cost['samasta'] = 0.7
edge_cost['samuccitam'] = 0.7
edge_cost['upasargaH'] = 0.65
# for k in karakas:
#     edge_cost['sambadDa-'+k] = edge_cost[k]
edge_cost['vAkyasambanDaH'] = 0.3
# edge_cost['zazWI-sambanDa'] = 1
# Labels whose cost does not scale with distance
edge_cost_const = ['vAkyasambanDaH', 'samuccitam', 'BAvalakzaRam']
class VakyaGraph(object):
    """ DAG class for Sanskrit Vakya Analysis
    Represents Sanskrit Vakya Analysis as a DAG
    Nodes are SanskritObjects with morphological tags
    Edges are potential relationships between them
    """
    def __init__(self, path, max_parse_dc=4, fast_merge=True):
        ''' DAG Class Init
        Params:
            path: Path from SandhiGraph
            max_parse_dc (int): largest partition span parsed directly;
                wider spans are split by the divide & conquer routine
            fast_merge (bool): use the fast merge strategy when combining
                partial parses
        '''
        self.max_parse_dc = max_parse_dc
        self.fast_merge = fast_merge
        self.roots = []
        self.isLocked = False
        # Multigraph
        self.G = nx.MultiDiGraph()  # Allow parallel edges
        # Need this many nodes in the extracted subgraphs
        self.path_node_count = len(path)
        logger.debug(f"{self.path_node_count} sets of orthogonal nodes")
        # One partition per pada: the set of its alternative analyses
        self.partitions = []
        for (ix, sobj) in enumerate(path):
            vnlist = []
            mtags = sobj.getMorphologicalTags()
            # One graph node per alternative morphological analysis
            for mtag in mtags:
                ncopy = SanskritObject(sobj.canonical(),
                                       encoding=SLP1,
                                       replace_ending_visarga=None)
                ncopy.setMorphologicalTags(mtag)
                pn = VakyaGraphNode(ncopy, ix)
                vnlist.append(pn)
            for vn in vnlist:
                self.add_node(vn)
            self.partitions.append(set(vnlist))
        logger.debug(f"Node Partitions {self.partitions} Len {len(self.partitions)}")
        self.lock()
        self.add_edges()
        # Remove isolated nodes (with no edges)
        isolates = list(nx.isolates(self.G))
        self.G.remove_nodes_from(isolates)
        for (ix, s) in enumerate(self.partitions):
            s.difference_update(isolates)
            if len(s) == 0:
                logger.error(f"Partition {ix}: {path[ix]} went to zero length!")
        start_parse = time.time()
        self.parses = self.get_parses_dc()
        end_parse = time.time()
        self.check_parse_validity()
        end_check = time.time()
        # _order_parses is a module-level helper defined elsewhere in file
        self.parses, self.parse_costs = _order_parses(self.parses)
        logger.info(f"Time for parse: {(end_parse-start_parse):1.6f}s")
        logger.info(f"Total for Global Check: {(end_check-end_parse):1.6f}s")
        logger.info(f"Total Time for parse: {(end_check-start_parse):1.6f}s")
def __iter__(self):
''' Iterate over nodes '''
return self.G.__iter__()
def __str__(self):
""" Print representation of DAG """
return str(self.G)
[docs] def lock(self):
self.isLocked = True
[docs] def add_node(self, node):
""" Extend dag with node inserted at root
Params:
Node (VakyaGraphNode) : Node to add
"""
assert node not in self.G
assert not self.isLocked
self.G.add_node(node)
def add_edges(self):
    """Run all edge-adding passes over the locked node set.

    The passes run in a fixed order: conjunction handling mutates node
    tags and therefore must precede the kAraka pass.
    """
    assert self.isLocked
    laks = self.find_lakaras()
    krts = self.find_krtverbs()
    # Verb bases = finite verbs (lakaras) + krt (participial) forms
    bases = []
    bases.extend(laks)
    bases.extend(krts)
    logger.debug("Processing Conjuctions")
    self.add_conjunctions(bases)  # Needs to happen before kAraka
    logger.debug("Adding Edges")
    self.add_karakas(bases)
    self.add_samastas()
    self.add_shashthi()
    self.add_kriyavisheshana(bases)  # FIXME Parallel edge problem
    self.add_visheshana()
    self.add_kriya_kriya(laks, krts)
    self.add_avyayas(bases)
    self.add_bhavalakshana(krts, laks)
    self.add_non_karaka_vibhaktis()
    self.add_vipsa()
    self.add_sentence_conjunctions(laks, krts)
def find_krtverbs(self):
    ''' Find non ti~Nanta verbs'''
    candidates = []
    for node in self.G:
        if node.node_is_a(krtverbs):
            logger.debug(f"{node} is a possible Krt Dhatu")
            candidates.append(node)
    return candidates
def find_lakaras(self):
    ''' Find the ti~Nanta '''
    candidates = []
    for node in self.G:
        if node.node_is_a(lakaras):
            logger.debug(f"{node} is a possible Dhatu")
            candidates.append(node)
    return candidates
def add_visheshana(self):
    """Add viSezaRam (adjective) edges between agreeing nominals."""
    for n in self.G:
        if not n.node_is_a(vibhaktis):
            continue
        for other in self.G:
            if _is_same_partition(n, other):
                continue
            if not match_linga_vacana_vibhakti(n, other):
                continue
            # Identical bases are repetition, not qualification
            if _get_base(n) != _get_base(other):
                logger.debug(f"Adding viSezaRa edge: {n,other}")
                self.G.add_edge(n, other, label="viSezaRam")
def add_vipsa(self):
    """Add vIpsA edges between adjacent identical padas (repetition)."""
    for left in self.G:
        for right in self.G:
            if (left.index == (right.index-1)) and \
                    (left.pada.canonical() == right.pada.canonical()):
                logger.debug(f"Adding vIpsa edge: {left, right}")
                self.G.add_edge(left, right, label="vIpsA")
def add_samastas(self):
    ''' Add samasta links from next samasta/tiN '''
    last = len(self.partitions) - 1
    for (i, part) in enumerate(self.partitions):
        # Cant have samasta as last node
        if i == last:
            continue
        nextset = self.partitions[i + 1]
        for n in part:
            # If node is a samasta, check options for
            # next node, and add samasta links if tiN
            if not n.node_is_a(samastas):
                continue
            for nn in nextset:
                if nn.node_is_a(vibhaktis) or nn.node_is_a(samastas):
                    logger.debug(f"Adding samasta edge: {n,nn}")
                    self.G.add_edge(nn, n, label="samasta")
def add_shashthi(self):
    ''' Add zazWI-sambanDa links to next tiN '''
    last = len(self.partitions) - 1
    for (i, part) in enumerate(self.partitions):
        # Cant have sambandha open at last node
        if i == last:
            continue
        nextset = self.partitions[i + 1]
        for n in part:
            # If node is a shashthi, check next node, add links if tiN
            if not n.node_is_a(shashthi):
                continue
            for nn in nextset:
                if nn.node_is_a(vibhaktis) or nn.node_is_a(samastas):
                    logger.debug(f"Adding shashthi-sambandha edge: {n,nn}")
                    self.G.add_edge(nn, n, label="zazWI-sambanDa")
def add_karakas(self, bases):
    ''' Add karaka edges from base node (dhatu) base

    For each verb base, inspect every node of other partitions and add
    candidate kAraka edges based on vibhakti, voice (kartari/karmaRi),
    causative status (Rijanta) and the dhatu's transitivity, which is
    looked up through the module-level DhatuWrapper ``dw``.
    '''
    for d in bases:
        logger.debug(f"Processing {d}")
        # Dhatu name; strip any "#<suffix>" disambiguation marker
        dh = _get_base(d).canonical()
        hpos = dh.find("#")
        if hpos != -1:
            dh = dh[:hpos]
        if d.node_is_a(lakaras) or d.node_is_a('avyayaDAturUpa'):
            is_sak = dw.is_sakarmaka(dh)
            is_dvik = dw.is_dvikarmaka(dh)
        else:
            is_sak = True  # No way of knowing, set True
            is_dvik = False
        logger.debug(f"Dhatu: {dh} Sakarmaka {is_sak} Dvikarmaka {is_dvik}")
        if d.node_is_a(karmani):
            logger.debug("Karmani")
        else:
            logger.debug("Kartari")
        if d.node_is_a(nijanta):
            logger.info("Nijanta Dhatu")
        for n in self.G:
            if not _is_same_partition(d, n):
                if d.node_is_a(karmani):
                    # Passive: tftIyA marks the agent, praTamA the karma
                    if n.node_is_a(tritiya):
                        if d.node_is_a(nijanta):
                            logger.debug(f"Adding hetu-kartA edge to {n}")
                            self.G.add_edge(d, n, label="hetu-kartA")
                        logger.debug(f"Adding kartA edge to {n}")
                        self.G.add_edge(d, n, label="kartA")
                    elif (n.node_is_a(prathama) and match_purusha_vacana(d, n)
                          and d.node_is_a(lakaras) and is_sak):
                        # Only Lakaras and kartari krts are allowed
                        # Karma
                        logger.debug(f"Adding karma edge to {n}")
                        self.G.add_edge(d, n, label="karma")
                        if is_dvik:
                            logger.debug(f"Adding gauRakarma edge to {n}")
                            self.G.add_edge(d, n, label="gauRa-karma")
                else:
                    # Active: agreeing praTamA marks the agent,
                    # dvitIyA the karma
                    if n.node_is_a(prathama) and d.node_is_a(lakaras) and match_purusha_vacana(d, n):
                        if d.node_is_a(nijanta):
                            logger.debug(f"Adding hetu-kartA edge to {n}")
                            self.G.add_edge(d, n, label="hetu-kartA")
                        else:
                            logger.debug(f"Adding kartA edge to {n}")
                            self.G.add_edge(d, n, label="kartA")
                    elif n.node_is_a(tritiya) and d.node_is_a(nijanta):
                        # Causative: instrumental may mark the caused agent
                        logger.debug(f"Adding kartA edge to {n}")
                        self.G.add_edge(d, n, label="kartA")
                    elif (n.node_is_a(dvitiya) and is_sak):
                        # Lakaras and Kartari krts are allowed
                        # Karma
                        logger.debug(f"Adding karma edge to {n}")
                        self.G.add_edge(d, n, label="karma")
                        if is_dvik:
                            logger.debug(f"Adding gauRakarma edge to {n}")
                            self.G.add_edge(d, n, label="gauRa-karma")
                # Voice-independent kArakas keyed purely on vibhakti
                if n.node_is_a(tritiya):
                    logger.debug(f"Adding karana edge to {n}")
                    self.G.add_edge(d, n, label="karaRam")
                elif n.node_is_a(chaturthi):
                    logger.debug(f"Adding sampradana edge to {n}")
                    self.G.add_edge(d, n, label="sampradAnam")
                elif n.node_is_a(pancami):
                    logger.debug(f"Adding apadana edge to {n}")
                    # NOTE(review): label 'apAdanam' differs from the
                    # 'apAdAnam' spelling in the karakas set, so
                    # edge_cost/projlabels lookups miss it — confirm
                    self.G.add_edge(d, n, label="apAdanam")
                elif n.node_is_a(saptami):
                    logger.debug(f"Adding adhikarana edge to {n}")
                    self.G.add_edge(d, n, label="aDikaraRam")
                elif n.node_is_a(sambodhana) and check_sambodhya(d, n):
                    logger.debug(f"Adding sambodhya edge to {n}")
                    self.G.add_edge(d, n, label="samboDyam")
def add_kriyavisheshana(self, bases):
    ''' Add kriyaviSezaRa edges from base node (dhatu) base '''
    for verb in bases:
        for n in self.G:
            if _is_same_partition(verb, n):
                continue
            # Adverbs: avyayas tagged kriyAviSezaRam, or listed particles
            if n.node_is_a(avyaya) and \
                    (n.node_is_a(kriyavisheshana) or
                     _get_base(n) in avyaya_kriyav):
                logger.debug(f"Adding kriyAviSezaRa edge to {n}")
                self.G.add_edge(verb, n, label="kriyAviSezaRam")
def add_kriya_kriya(self, lakaras, krts):
    ''' Add kriya-kriya edges from lakaras to krts'''
    for main in lakaras:
        for sub in krts:
            if _is_same_partition(main, sub):
                continue
            if sub.node_is_a(purvakala):
                # Absolutives: action prior to the main verb
                logger.debug(f"Adding pUrvakAlaH edge to {sub}")
                self.G.add_edge(main, sub, label="pUrvakAlaH")
            elif sub.node_is_a('tumun'):
                # Infinitive of purpose
                logger.debug(f"Adding prayojanam edge to {sub}")
                self.G.add_edge(main, sub, label="prayojanam")
            elif sub.node_is_a(samanakala) and sub.node_is_a(prathama) \
                    and match_purusha_vacana(main, sub):
                # Present participles: simultaneous action
                logger.debug(f"Adding samAnakAlaH edge to {sub}")
                self.G.add_edge(main, sub, label="samAnakAlaH")
def add_conjunctions(self, bases):
    ''' Add samuccita links for conjunctions/disjunctions

    Pass 1: link each ca/uta/vA node to its preceding coordinand and to
    earlier nodes with the same vibhakti/lakara, temporarily tagging the
    conjunction with the borrowed vibhakti/lakara tags.
    Pass 2: strip the temporary tags again and compute the combined
    vacana/linga of the coordinated phrase onto the conjunction node.
    '''
    # First pass, add samuccitam edges
    for (i, s) in enumerate(self.partitions):
        for n in s:
            if n.node_is_a(avyaya) and ((_get_base(n) in conjunctions) or (_get_base(n) in disjunctions)):
                # Add samuccitam to previous node
                # NOTE(review): at i == 0 this wraps to the LAST
                # partition (partitions[-1]) — confirm intentional
                prevset = self.partitions[i-1]
                for nn in prevset:
                    if nn.node_is_a(vibhaktis):
                        logger.info(f"Adding samuccitam edge: {n,nn}")
                        self.G.add_edge(n, nn, label="samuccitam")
                        vibhakti = nn.get_vibhakti()
                        n.setTags(vibhakti)  # Keep as strings for now since we may delete this
                        logger.debug(f'Node with temporary vibhakti {n}')
                        # Walk left over earlier partitions linking all
                        # coordinands with the same vibhakti
                        ni = i-2
                        while ni >= 0:
                            ppset = self.partitions[ni]
                            isconj = False
                            for nnn in ppset:
                                # logger.info(f'Node in partition {ni} {nnn}')
                                if nnn.get_vibhakti() == vibhakti:
                                    # if further previous node is the same conjunction, add samuccitam to that and stop
                                    if (ni == (i-2)) and nnn.node_is_a(avyaya) and _get_base(n) == _get_base(nnn):
                                        logger.info(f"Adding samuccitam edge to conj/disjunction: {nn,nnn}")
                                        self.G.add_edge(nn, nnn, label="samuccitam")
                                        isconj = True
                                    elif not ((_get_base(nnn) in conjunctions) or (_get_base(nnn) in disjunctions)):
                                        # Else, If further previous nodes are the same vibhakti, add samuccitam to those
                                        logger.info(f"Adding samuccitam edge to similar vibhakti: {n,nnn}")
                                        self.G.add_edge(n, nnn, label="samuccitam")
                            # Force exit from while loop if we hit another conjuction/disjunction
                            # FIXME: unclear how to handle the case where ca/uta/vA can refer to objects (sattva), ignoring
                            if isconj:
                                break
                            else:
                                ni = ni - 1
                    elif nn.node_is_a(lakaras):
                        logger.info(f"Adding samuccitam edge: {n,nn}")
                        self.G.add_edge(n, nn, label="samuccitam")
                        lakara = nn.get_lakara()
                        puruza = nn.get_purusha()
                        vacana = nn.get_vacana()
                        n.setTags(lakara | puruza | vacana)  # Keep as strings for now since we may delete this
                        logger.debug(f'Node with temporary lakara {n}')
                        # Same leftward walk, but matching verbs by
                        # lakara + purusha/vacana agreement
                        ni = i-2
                        while ni >= 0:
                            ppset = self.partitions[ni]
                            isconj = False
                            for nnn in ppset:
                                # logger.info(f'Node in partition {ni} {nnn}')
                                if (nnn.get_lakara() == lakara) and match_purusha_vacana(nnn, nn):
                                    # if further previous node is the same conjunction, add samuccitam to that and stop
                                    if (ni == (i-2)) and nnn.node_is_a(avyaya) and _get_base(n) == _get_base(nnn):
                                        logger.info(f"Adding samuccitam edge to conj/disjunction: {nn,nnn}")
                                        self.G.add_edge(nn, nnn, label="samuccitam")
                                        isconj = True
                                    elif not ((_get_base(nnn) in conjunctions) or (_get_base(nnn) in disjunctions)):
                                        # Else, If further previous nodes are the same vibhakti, add samuccitam to those
                                        logger.info(f"Adding samuccitam edge to similar lakara: {n,nnn}")
                                        self.G.add_edge(n, nnn, label="samuccitam")
                            # Force exit from while loop if we hit another conjuction/disjunction
                            # FIXME: unclear how to handle the case where ca/uta/vA can refer to objects (sattva), ignoring
                            if isconj:
                                break
                            else:
                                ni = ni - 1
    # Second pass, remove vibhaktis
    for (i, s) in enumerate(self.partitions):
        for n in s:
            if n.node_is_a(avyaya) and ((_get_base(n) in conjunctions) or (_get_base(n) in disjunctions)):
                # We would have set a vibhakti or lakara string
                v = n.get_vibhakti()
                lk = n.get_lakara()
                vc = n.get_vacana()
                p = n.get_purusha()
                for e in self.G.in_edges(n, data=True):  # Note: keys=False
                    if e[2]['label'] == 'samuccitam':
                        if v:
                            logger.debug(f'removing vibhakti {v} from {n}')
                            n.deleteTags(v)
                            logger.debug(f'removed vibhakti {v} from {n}')
                        if lk:
                            logger.debug(f'removing lakara {lk | vc | p} from {n}')
                            n.deleteTags(lk | vc | p)
                            logger.debug(f'removed lakara {lk | vc | p} from {n}')
                # Still there!
                if lk and n.node_is_a(lk):  # Lakara need not be set
                    # logger.info(f'Node before lakara locked {n}')
                    n.deleteTags(lk | vc | p | v)
                    # n.setTags(set([SanskritObject(_v, SLP1) for _v in list(lk | vc | p)]))
                    logger.info(f'Node with lakara removed {n}')
                elif v and n.node_is_a(v):
                    # Re-add the vibhakti as proper SanskritObject tags
                    n.deleteTags(v)
                    n.setTags(set([SanskritObject(_v, SLP1) for _v in list(v)]))
                    logger.info(f'Node with vibhakti locked {n}')
                # Compute vacanam and lingam: Conjunctions
                is_conj = (_get_base(n) in conjunctions)
                vacana = ''
                linga = 'strIliNgam'

                def _vacana(v1, v2, is_conj=True):
                    # Combine vacanas: 'ca' of two singulars is dual,
                    # anything more is plural; disjunctions keep the
                    # "largest" vacana seen
                    logger.debug(f'{v1, v2, is_conj}')
                    if is_conj:
                        if v1 == 'ekavacanam' and v2 == 'ekavacanam':
                            return 'dvivacanam'
                        else:
                            return 'bahuvacanam'
                    else:
                        if v1 == 'ekavacanam' and v2 == 'ekavacanam':
                            return 'ekavacanam'
                        elif v1 == 'bahuvacanam' or v2 == 'bahuvacanam':
                            return 'bahuvacanam'
                        else:
                            return 'dvivacanam'

                def _linga(l1, l2):
                    # Combined linga: neuter dominates, then masculine
                    logger.debug(f'{l1,l2}')
                    if l1 == 'napuMsakaliNgam' or l2 == 'napuMsakaliNgam':
                        return 'napuMsakaliNgam'
                    elif l1 == 'puMlliNgam' or l2 == 'puMlliNgam':
                        return 'puMlliNgam'
                    else:
                        return 'strIliNgam'
                # Depth first (we only get samuccitam edges)
                # NOTE(review): the inner loop variable s shadows the
                # outer partition variable s; harmless since the outer s
                # is re-bound each iteration, but fragile
                for (nn, sl) in nx.dfs_successors(self.G, source=n).items():
                    for s in sl:
                        logger.info(f'DFS {s} from {nn} root {n}')
                        _v = s.get_vacana()
                        _l = s.get_linga()
                        # NOTE(review): if the first successor has no
                        # vacana, list(_v)[0] raises IndexError — confirm
                        if vacana == '':
                            vacana = list(_v)[0]
                        elif _v:
                            vacana = _vacana(vacana, list(_v)[0], is_conj)
                        if _l:
                            linga = _linga(linga, list(_l)[0])
                n.setTags({SanskritObject(vacana, SLP1), SanskritObject(linga, SLP1)})
                logger.info(f'Node with vacana/linga locked {n}')
def add_avyayas(self, bases):
    ''' Add Avyaya Links

    Handles three kinds of indeclinables:
      - upasargaH: linked from a following verb form (except ktvA) or
        from a following upasarga
      - nishedha (na): linked from every verb base
      - karmapravacanIyaH: linked from verb bases, plus upapada links
        to adjacent dvitIyA/paYcamI nominals
    '''
    # Upasargas
    for (i, s) in enumerate(self.partitions):
        for n in s:
            if n.node_is_a('upasargaH'):
                # Cant have upasargas at last node
                if i < (len(self.partitions)-1):
                    nextset = self.partitions[i+1]
                    # Upasarga to upasarga links ok
                    # Upasarga to any verb form barring ktvA
                    for nn in nextset:
                        if ((nn in bases) and not nn.node_is_a('ktvA')) or \
                                nn.node_is_a('upasargaH'):
                            logger.debug(f"Adding upasarga edge: {n,nn}")
                            self.G.add_edge(nn, n, label="upasargaH")
            elif n.node_is_a(avyaya) and (_get_base(n) in nishedha):
                # Negation particle attaches to every verb base
                for b in bases:
                    if not _is_same_partition(n, b):
                        logger.debug(f"Adding nishedha edge: {n, b}")
                        self.G.add_edge(b, n, label="nizeDa")
            elif n.node_is_a('karmapravacanIyaH') and not (_get_base(n) in avyaya_kriyav) and not(_get_base(n) in karmap_null):
                for b in bases:
                    if not _is_same_partition(n, b):
                        logger.debug(f"Adding karmapravacaniya edge: {n, b}")
                        self.G.add_edge(b, n, label="karmapravacanIyaH")
                # Upapada vibhakti links to the immediately adjacent padas
                if i < (len(self.partitions)-1):
                    nextset = self.partitions[i+1]
                else:
                    nextset = set([])
                if i > 0:
                    prevset = self.partitions[i-1]
                else:
                    prevset = set([])
                for nn in nextset.union(prevset):
                    if nn.node_is_a(dvitiya) and (_get_base(n) in karmap_2):
                        logger.debug(f"Adding karmapravacaniya upapada 2 edge: {n,nn}")
                        self.G.add_edge(n, nn, label="upapada-dvitIyA")
                    elif nn.node_is_a(pancami) and (_get_base(n) in karmap_5):
                        logger.debug(f"Adding karmapravacaniya upapada 5 edge: {n,nn}")
                        self.G.add_edge(n, nn, label="upapada-pancamI")
def add_bhavalakshana(self, krts, laks):
    ''' Add bhavalakshana edges from saptami krts to lakaras '''
    saptami_krts = (k for k in krts if k.node_is_a(saptami))
    for k in saptami_krts:
        for lak in laks:
            if not _is_same_partition(k, lak):
                logger.debug(f"Adding Bhavalakshana edge: {k, lak}")
                self.G.add_edge(lak, k, label="BAvalakzaRam")
def add_non_karaka_vibhaktis(self):
    ''' Add Non-Karaka Vibhaktis

    For each node whose base is an upapada word (e.g. saha, vinA),
    add an "upapada-<vibhakti>" edge to every node of the governed
    vibhakti in other partitions.
    '''
    for n in self.G:
        nb = _get_base(n)
        for ix in range(7):
            if (ix+1 in non_karaka_vibhaktis) and \
                    (nb in non_karaka_vibhaktis[ix+1]):
                # Hoisted: the label depends only on the vibhakti index,
                # not on the inner-loop node
                vlabel = ("upapada-"+_vibhaktis[ix]).replace("viBakti", "")
                for nn in self.G:
                    if nn.node_is_a(_vibhaktis[ix]) and \
                            (not _is_same_partition(nn, n)):
                        logger.debug(f"Adding {vlabel} edge {nn, n}")
                        self.G.add_edge(n, nn, label=vlabel)
def add_sentence_conjunctions(self, laks, krts):
    ''' Add sentence conjunction links
    For all nodes which match sentence_conjuction keys
    add vAkyasambanDaH link between y-t pair (if vibhakti matches where relevant)
    Reverse all edges to node, add sambadDa- to link label (eg sambadDa-karma, if node is not vIpsa
    if node is saMyojakaH, and not vIpsA add saMbadDakriyA links to verbs
    if associated t* doesn't exist vAkyasambanDaH links from verbs
    '''
    def _is_vipsa(n):
        # True if n already has an incoming vIpsA (repetition) edge
        for p, kd in self.G.pred[n].items():
            # Multigraph
            for k, d in kd.items():
                if d['label'] == 'vIpsA':
                    return True
        return False
    # Only prathama krts are needed here. The rest aren't relevant
    bases = []
    bases.extend(laks)
    bases.extend([n for n in krts if n.node_is_a(prathama)])
    sentence_conjunctions_y = set(sentence_conjunctions.keys())
    for n in self.G:
        nb = _get_base(n)
        if nb in sentence_conjunctions_y:
            if not _is_vipsa(n):
                # Reverse edges, add sambadDa label
                # for MG, G.pred[n].items() is an iterator of (node, edgeitem) pairs
                for p, kd in list(self.G.pred[n].items()):
                    # edgeitem.items() is an iterator of num, dict pairs
                    for k, d in list(kd.items()):
                        # self.G.remove_edge(p, n)
                        if d['label'] == 'viSezaRam':
                            # Need to invert following edge as well
                            for p_p, p_kd in list(self.G.pred[p].items()):
                                for p_k, p_d in list(p_kd.items()):
                                    # self.G.remove_edge(p, n)
                                    # Don't double-prefix sambadDa-
                                    if p_d['label'][:9] != 'sambadDa-':
                                        self.G.add_edge(p, p_p, label='sambadDa-'+p_d['label'])
                        self.G.add_edge(n, p, label='sambadDa-'+d['label'])
            # Matching pair: link the ya- word to its ta- correlative
            s_t = sentence_conjunctions[nb]
            for nn in self.G:
                if (not _is_vipsa(nn)) and (_get_base(nn) in s_t) and (not _is_same_partition(n, nn)):
                    if match_linga_vacana(n, nn):
                        logger.info(f"Adding vAkyasambanDaH edge: {nn, n}")
                        self.G.add_edge(nn, n, label="vAkyasambanDaH")
                if n.node_is_a('saMyojakaH') and (nn in bases) and (not _is_same_partition(n, nn)):
                    logger.info(f"Adding saMbadDakriyA edge: {nn, n}")
                    self.G.add_edge(n, nn, label='saMbadDakriyA')
                    # A correlative may be absent (None in the pair set)
                    if None in s_t:
                        logger.info(f"Adding vAkyasambanDaH edge: {nn, n}")
                        self.G.add_edge(nn, n, label="vAkyasambanDaH")
def get_parses_dc(self):
    ''' Returns all parses
    Uses modified Kruskal Algorithm to compute (generalized) spanning
    tree of k-partite VakyaGraph
    '''
    logger.debug("Computing Parses (Divide & Conquer)")

    def _get_parse_sub(mn, mx):
        # Build partial parses over partitions [mn, mx), left to right
        # Iterate over subsets of disjoint nodesets
        for (i, ns) in enumerate(islice(self.partitions, mn, mx)):
            logger.debug(f"Node set number {i} {ns}")
            if i == 0:
                # Partial parses = phi + all predecessors
                partial_parses = set()
                partial_parses.add(VakyaParse(None))  # Null partial parse
                # For all input edges to this set
                for n in ns:
                    logger.debug(f"Traversing node {n}")
                    for pred in self.G.predecessors(n):
                        logger.debug(f"Traversing predecessor {pred} -> {n}")
                        partial_parses.add(VakyaParse((pred, n)))
            else:
                store_parses = set()
                small_parses = set()
                for ps in partial_parses:  # For each partial parse
                    if len(ps) < i:  # Small parses to be removed
                        small_parses.add(ps)
                    for n in ns:  # For all input edges to this set
                        logger.debug(f"Traversing node {n}")
                        for pred in self.G.predecessors(n):
                            logger.debug(f"Traversing predecessor {pred} -> {n}")
                            # If edge is compatible with partial parse, add and create new partial parse
                            logger.debug(f"Trying to extend parse {ps}")
                            if ps.is_safe(pred, n):
                                logger.debug(f"{pred} - {n} is safe for {ps}")
                                psc = ps.copy()  # Copy the nodeset and DisjointSet structures
                                psc.extend(pred, n)
                                store_parses.add(psc)
                partial_parses.difference_update(small_parses)
                partial_parses.update(store_parses)
        logger.debug("Partial Parses")
        for p in partial_parses:
            logger.debug(p)
        return partial_parses

    # Divide & Conquer routine
    def _dc(mn, mx):
        logger.info(f"Divide And Conquer {mn, mx}")
        if (mx - mn) > self.max_parse_dc:
            # Span too wide: split in half, parse each, merge results
            md = int((mx + mn)/2)
            return _merge_partials(_dc(mn, md), _dc(md, mx), mx, mn)
        else:
            t = _get_parse_sub(mn, mx)
            return t

    def _merge_partials(pp1, pp2, mx, mn):
        # Cross-product merge of two sets of partial parses
        logger.info(f"Merging between {mn, mx} {len(pp1)} x {len(pp2)}")
        start_time = time.time()
        logger.debug(f"Merging {pp1, pp2}")
        ppmt = set()
        for ppa in pp1:
            for ppb in pp2:
                if self.fast_merge:
                    if ppa.can_merge(ppb, mx-mn-1):
                        ppmt.add(ppa.merge_f(ppb))
                else:  # Slow Merge
                    merged = ppa.merge_s(ppb, mx-mn-1)
                    if merged:
                        ppmt.add(merged)
        logger.debug(f"{len(ppmt)} parses")
        logger.debug(f"Merged {ppmt}")
        end_time = time.time()
        logger.info(f"Time for merge {end_time-start_time}")
        return ppmt

    partial_parses = _dc(0, self.path_node_count)
    logger.debug(f"Partial Parses Before Return {partial_parses}")
    # Multigraph: expand each partial parse into concrete edge subgraphs,
    # one per choice among parallel edges (multiedgesets is a module helper)
    return set([self.G.edge_subgraph(m) for p in partial_parses for m in multiedgesets(p, self.G)])
def check_parse_validity(self):
    ''' Validity Check for parses
    Remove parses with double kArakas
    Remove parses with multiple to edges into a node
    Remove parses with cycles
    '''
    logger.debug(f"Parses before validity check {len(self.parses)}")
    invalid = set()
    for parse in self.parses:
        if not _check_parse(parse):
            logger.debug(f"Will remove {parse}")
            invalid.add(parse)  # Collect invalid parses
    # Remove them
    self.parses.difference_update(invalid)
    logger.info(f"Parses after validity check {len(self.parses)}")
def draw(self, *args, **kwargs):
    """Draw the graph with matplotlib, de-duplicating node labels.

    BUGFIX: previously called nx.graph_layout, which does not exist in
    networkx and raised AttributeError; use nx.spring_layout(self.G)
    like SandhiGraph.draw.
    """
    _ncache = {}

    def _uniq(s):
        # First occurrence keeps its name; later duplicates get _N suffix
        if s not in _ncache:
            _ncache[s] = 0
            return s
        else:
            _ncache[s] = _ncache[s] + 1
            return s + "_" + str(_ncache[s])

    nx.draw(self.G, *args, **kwargs,
            pos=nx.spring_layout(self.G),
            labels={x: _uniq(str(x)) for x in self})
def write_dot(self, path):
    """Write the split graph to ``path`` and each parse to
    ``<stem>_parse<i><ext>`` next to it."""
    nx.drawing.nx_pydot.write_dot(self.G, path)
    folder = dirname(path)
    stem, ext = splitext(basename(path))
    logger.debug(f"Path {folder} {stem} {ext}")
    for i, parse in enumerate(self.parses):
        target = join(folder, stem + f"_parse{i}" + ext)
        nx.drawing.nx_pydot.write_dot(parse, target)
def get_dot_dict(self):
    """Return dot-format strings: key 'split' for the full graph,
    integer keys for each parse."""
    from io import StringIO
    result = {}
    buf = StringIO()
    nx.drawing.nx_pydot.write_dot(self.G, buf)
    result["split"] = buf.getvalue()
    for i, parse in enumerate(self.parses):
        buf = StringIO()
        nx.drawing.nx_pydot.write_dot(parse, buf)
        result[i] = buf.getvalue()
    return result
class VakyaGraphNode(object):
    """ Class for VakyaGraph nodes

    Wraps a pada (a SanskritObject carrying morphological tags) plus the
    index of the partition (pada position) it belongs to. The string
    representation is cached for speed.
    """
    def __init__(self, sobj, index):
        self._pada = sobj
        self._index = index
        self._cache_str()

    @property
    def pada(self):
        return self._pada

    @pada.setter
    def pada(self, pada):
        # BUGFIX: the setter was previously decorated onto a function
        # named _set_pada, which left the 'pada' property read-only
        # (assignment raised AttributeError)
        self._pada = pada
        self._cache_str()

    @property
    def index(self):
        return self._index

    @index.setter
    def index(self, index):
        # BUGFIX: same read-only-property problem as 'pada' above
        self._index = index
        self._cache_str()

    def _cache_str(self):
        ''' Cache the string representation for speedup.
        See https://github.com/kmadathil/sanskrit_parser/issues/160
        '''
        self._str = str(self._pada) + " " + str(self._pada.getMorphologicalTags()) + \
            " " + str(self._index)

    def getMorphologicalTags(self):
        """Return the wrapped pada's morphological tags."""
        return self._pada.getMorphologicalTags()

    def getNodeTagset(self):
        ''' Given a Node, extract the tagset '''
        return getSLP1Tagset(self.getMorphologicalTags())

    def deleteTags(self, t):
        """Remove tags ``t`` from the pada (in place); refresh cache."""
        self._pada.tags[1].difference_update(t)
        self._cache_str()
        return self

    def setTags(self, t):
        """Add tags ``t`` to the pada (in place); refresh cache."""
        self._pada.tags[1].update(t)
        self._cache_str()
        return t

    def node_is_a(self, st):
        ''' Check if node matches a particular tag or any of a set of tags '''
        if isinstance(st, str):
            return st in self.getNodeTagset()
        elif isinstance(st, set):
            return not st.isdisjoint(self.getNodeTagset())
        else:
            logger.error(f"node_is_a: expecting str or set, got {st} of type {type(st)}")
            # Explicit falsy return (was an implicit None)
            return False

    def get_vibhakti(self):
        ''' Get Node vibhakti tags '''
        return self.getNodeTagset().intersection(vibhaktis)

    def get_lakara(self):
        ''' Get Node lakara tags '''
        return self.getNodeTagset().intersection(lakaras)

    def get_vacana(self):
        ''' Get Node vacana '''
        return self.getNodeTagset().intersection(vacanas)

    def get_linga(self):
        ''' Get Node linga '''
        return self.getNodeTagset().intersection(lingas)

    def get_purusha(self):
        ''' Get Node puruza '''
        return self.getNodeTagset().intersection(puruzas)

    def __str__(self):
        return self._str

    def __repr__(self):
        return str(self)
class VakyaParse(object):
    def __init__(self, nodepair):
        ''' Initializes a partial parse with a node pair (or []) '''
        # DisjointSet (as in Kruskal)
        self.connections = DisjointSet()
        # "Extinguished" nodes - nodes from partitions whose representatives
        # Have been added to the forest/ST already
        self.extinguished = set()
        # Nodes in the forest/ST
        self.activenodes = set()
        if nodepair is not None:
            # Seed the parse with its first edge (also sets self.edges)
            self._populate(nodepair)
        else:
            # Edges in the forest/ST
            self.edges = set()

    def __repr__(self):
        return str(self)

    def __str__(self):
        # Render as a list of (from, to) canonical pada pairs
        return str([(u.pada.canonical(), v.pada.canonical()) for (u, v) in self.edges])
def _populate(self, nodepair):
# Add first edge to forest
self.edges = set([(nodepair[0], nodepair[1])])
for n in nodepair:
self.activate_and_extinguish_alternatives(n)
self.connections.union(nodepair[0], nodepair[1])
[docs] def activate_and_extinguish_alternatives(self, node):
''' Make node active, extinguish other nodes in its partition '''
self.activenodes.add(node)
self.extinguished.add(node.index)
[docs] def is_extinguished(self, node):
''' Is a node extinguished '''
return (node.index in self.extinguished) and \
(node not in self.activenodes)
[docs] def is_safe(self, pred, node):
''' Checks if a partial parse is compatible with a given node and predecessor pair '''
if self.is_extinguished(pred) or self.is_extinguished(node):
r = False
elif (pred in self.activenodes) and (node in self.activenodes):
r = not self.connections.connected(pred, node)
else:
r = True
return r
[docs] def extend(self, pred, node):
''' Extend current parse with edge from pred to node '''
if not self.activenodes:
logger.debug(f"Populating {self.edges} with {(pred,node)}")
self._populate((pred, node))
else:
logger.debug(f"Extending {self.edges} with {(pred,node)}")
if pred not in self.activenodes:
self.activate_and_extinguish_alternatives(pred)
if node not in self.activenodes:
self.activate_and_extinguish_alternatives(node)
self.edges.add((pred, node))
self.connections.union(pred, node)
logger.debug(f"Edges {self.edges}")
[docs] def can_merge(self, other, length):
''' Can we merge two VakyaParses '''
# Proper length
if (len(other.edges) + len(self.edges)) < length:
return False
# No extinguished nodes
for x in other.activenodes:
if self.is_extinguished(x):
return False
conn = self.connections.copy()
# No cycles
for (u, v) in other.edges:
if conn.connected(u, v):
return False
else:
conn.union(u, v)
return True
[docs] def merge_f(self, other):
''' Merge two VakyaParses: Fast method '''
t = self.copy()
logger.debug("Merging")
t.extinguished.update(other.extinguished)
t.activenodes.update(other.activenodes)
t.edges.update(other.edges)
for (u, v) in other.edges:
t.connections.union(u, v)
return t
[docs] def merge_s(self, other, length):
''' Merge two VakyaParses: Slow method '''
# Proper length
if (len(other.edges) + len(self.edges)) < length:
return False
t = self.copy()
for (u, v) in other.edges:
if t.is_safe(u, v):
t.extend(u, v)
else:
return False
return t
def __len__(self):
return len(self.edges)
[docs] def copy(self):
''' Return a one level deep copy - in between a shallow and a fully deep copy '''
t = VakyaParse(None)
t.activenodes = copy(self.activenodes)
t.edges = copy(self.edges)
t.extinguished = copy(self.extinguished)
t.connections = self.connections.copy()
return t
def match_purusha_vacana(d, n):
    ''' Check vacana/puruza compatibility for a Dhatu d and node n '''
    # Pronoun bases determine the expected purusha: asmad -> puruzas[2],
    # yuzmad -> puruzas[1], anything else -> puruzas[0]
    base = _get_base(n)
    if base == 'asmad':
        expected_purusha = {puruzas[2]}
    elif base == 'yuzmad':
        expected_purusha = {puruzas[1]}
    else:
        expected_purusha = {puruzas[0]}
    if d.get_vacana() != n.get_vacana():
        return False
    return d.get_purusha() == expected_purusha
def match_linga_vacana(n1, n2):
    ''' Check linga/vacana compatibility for two nodes '''
    vacana_ok = n1.get_vacana() == n2.get_vacana()
    linga_ok = n1.get_linga() == n2.get_linga()
    return vacana_ok and linga_ok
def match_linga_vacana_vibhakti(n1, n2):
    ''' Check vacana, linga and vibhakti compatibility for two nodes '''
    # All three tag categories must agree
    for getter in ('get_vacana', 'get_linga', 'get_vibhakti'):
        if getattr(n1, getter)() != getattr(n2, getter)():
            return False
    return True
def check_sambodhya(d, n):
    ''' Check sambodhya compatibility for dhatu d and node n '''
    # Must agree in vacana, and the dhatu must be madhyama puruza
    if d.get_vacana() != n.get_vacana():
        return False
    return d.get_purusha() == {puruzas[1]}
def jedge(pred, node, label, strict_io=False):
    """ Helper to translate a parse edge into serializable format:
        (node pada, node tag, edge label, predecessor pada) as canonical strings.
    """
    node_str = node.pada.canonical(strict_io=strict_io)
    node_tag = jtag(node.getMorphologicalTags(), strict_io)
    label_str = SanskritImmutableString(label, encoding=SLP1).canonical(strict_io=strict_io)
    pred_str = pred.pada.canonical(strict_io=strict_io)
    return (node_str, node_tag, label_str, pred_str)
def jnode(node, strict_io=False):
    """ Helper to translate parse node into serializable format.

    Same shape as jedge's result, with empty label/predecessor slots.
    """
    pada_str = node.pada.canonical(strict_io=strict_io)
    node_tag = jtag(node.getMorphologicalTags(), strict_io)
    return (pada_str, node_tag, "", "")
def jtag(tag, strict_io=False):
    """ Helper to translate tag to serializable format:
        (canonical stem, list of canonical tag strings).
    """
    stem = tag[0].canonical(strict_io=strict_io)
    tagset = [t.canonical(strict_io=strict_io) for t in tag[1]]
    return (stem, tagset)
def _non_projective(u, v, w, x):
    """ Checks if an edge pair (u,v), (w,x) is non-projective (crossing).

    Edges are given by their endpoint indices; an edge pair crosses when
    one span starts strictly inside the other and ends strictly outside it.
    Returns bool. (Previously fell off the end for nested/disjoint spans,
    returning an implicit None; now returns an explicit False - equally
    falsy, so callers are unaffected.)
    """
    mnu = min(u, v)
    mxu = max(u, v)
    mnw = min(w, x)
    mxw = max(w, x)
    if mnu < mnw:
        # (u,v) starts first: crossing iff (w,x) starts inside it and ends outside
        return (mxu < mxw) and (mxu > mnw)
    elif mxu > mxw:
        # (u,v) ends last: crossing iff it starts strictly inside (w,x)
        return (mnu > mnw) and (mnu < mxw)
    # Nested or identical spans are projective
    return False
def _is_same_partition(n1, n2):
    ''' Are the two nodes n1 and n2 in the same partition?

    Partitions are identified by node index (alternative analyses of the
    same input word share an index).
    '''
    return n1.index == n2.index
def _get_base(n):
    # The base (stem) is the first element of the morphological tag tuple
    return n.getMorphologicalTags()[0]
def _order_parses(pu):
    '''
    Order a set of parses by ascending cost, returning the sorted
    parses and their costs.
    '''
    def _parse_cost(parse):
        # Cost = sum over edges of abs(index distance) * per-label cost,
        # except constant-cost labels; edges from lakara nodes get a
        # 0.9x preference.
        total = 0
        # Multigraph (keys=False)
        for (src, dst, lbl) in parse.edges(data='label'):
            if lbl in edge_cost_const:
                w = edge_cost[lbl]
            else:
                # Abs Edge length times edge_type cost
                w = abs(src.index - dst.index) * edge_cost[lbl]
            if src.node_is_a(lakaras):
                # Lakaras are preferred
                w = 0.9 * w
            total = total + w
        return round(total, 3)
    ordered = sorted(pu, key=_parse_cost)
    return ordered, [_parse_cost(p) for p in ordered]
# Check a parse for validity
def _check_parse(parse):
    ''' Check a candidate parse against global (whole-parse) constraints.

    Returns True iff none of the following are violated:
      - each karaka attaches at most once per dhatu node
      - sannidhi (projectivity) for labels in projlabels
      - at most one incoming karaka edge per node
      - at most one sambadDa edge out of a node
      - a viSezaRam may not itself have a viSezaRam
      - vAkyasambanDaH-linked nodes may not link beyond their partner
      - sentence conjunctions need exactly one in and one out edge
    '''
    r = True
    smbds = samplabels
    count = defaultdict(lambda: defaultdict(int))  # karaka counts per dhatu node
    edges = {}                    # index pairs subject to sannidhi checks
    toedge = defaultdict(int)     # incoming karaka edges per node
    fromv = defaultdict(int)      # outgoing viSezaRam edges per node
    tov = defaultdict(int)        # incoming viSezaRam edges per node
    sk = defaultdict(int)         # sambadDa edges per source node
    vsmbd = {}                    # vAkyasambanDaH partner index per node index
    conj = defaultdict(lambda: {"from": 0, "to": 0})
    sckeys = set(sentence_conjunctions.keys())
    # Hoisted loop invariant (was rebuilt on every edge)
    sambaddha_karakas = set('sambadDa-' + x for x in karakas)
    # Multigraph (keys=False)
    for (u, v, l) in parse.edges(data='label'):
        if l in karakas:
            count[u][l] = count[u][l] + 1
            toedge[v] = toedge[v] + 1
        if l in sambaddha_karakas:
            # FIX: removed dead assignment `ll = l[9:]` that was
            # immediately overwritten by the line below
            ll = l.replace('sambadDa-', '')  # Strip off sambadDa-
            # Cant have x and sambadDa-x to the same dhAturUpa
            # Note that sambadDa-x arcs are reversed
            count[v][ll] = count[v][ll] + 1
            toedge[v] = toedge[v] + 1
        # FIX: was `l in 'viSezaRam'`, a substring test that would also
        # match any label that is a substring of 'viSezaRam'
        if l == 'viSezaRam':
            fromv[u] = fromv[u] + 1
            tov[v] = tov[v] + 1
        if l in projlabels:  # Labels with sannidhi expectation
            edges[(u.index, v.index)] = 1
        if l in smbds:
            sk[u] = sk[u] + 1
        # FIX: was `l in 'vAkyasambanDaH'` - substring test, see above
        if l == 'vAkyasambanDaH':
            vsmbd[u.index] = v.index
            vsmbd[v.index] = u.index
        # Conjunction nodes must have in and out edges
        if _get_base(u) in sckeys:
            conj[u]["from"] = conj[u]["from"] + 1
        if _get_base(v) in sckeys:
            conj[v]["to"] = conj[v]["to"] + 1
    for u in count:  # Dhatu
        for k in count[u]:  # Each karaka should count only once
            if count[u][k] > 1:
                logger.debug(f"Count for {u} {k} is {count[u][k]} - violates global constraint")
                return False
    for (ui, vi) in edges:
        for (wi, xi) in edges:
            if _non_projective(ui, vi, wi, xi):  # Non-projective
                logger.debug(f"Sannidhi violation {ui} - {vi} : {wi} - {xi}")
                return False
    for v in toedge:
        if toedge[v] > 1:
            logger.debug(f"Toedges for {v} is {toedge[v]} - violates global constraint")
            return False
    for u in sk:  # Sambaddhakaraka = only one allowed from node
        if sk[u] > 1:
            logger.debug(f"Sambaddha karaka edges for {u} is {sk[u]} - violates global constraint")
            return False
    for v in tov:
        if v in fromv:
            logger.debug(f"Viseshana has visheshana {v} - violates global constraint")
            return False
    # Vakyasambabdha nodes - yadi/tarhi yatra/tatra etc cannot have links
    # beyond their partner
    # Multigraph (keys=False)
    for (u, v, l) in parse.edges(data='label'):
        if u.index in vsmbd:
            if ((vsmbd[u.index] > u.index) and (v.index > vsmbd[u.index])) or \
               ((vsmbd[u.index] < u.index) and (v.index < vsmbd[u.index])):
                logger.info(f"Sannidhi violation for vAkyasambadDa {u.index} - {v.index} : {u} {vsmbd[u.index]}")
                return False
        if v.index in vsmbd:
            if ((vsmbd[v.index] > v.index) and (u.index > vsmbd[v.index])) or \
               ((vsmbd[v.index] < v.index) and (u.index < vsmbd[v.index])):
                logger.info(f"Sannidhi violation for vAkyasambadDa {u.index} - {v.index} : {v} {vsmbd[v.index]}")
                return False
        if (u.index not in vsmbd) and l[:9] == 'sambadDa-':
            # FIX: label was not interpolated (message ended in a literal "l")
            logger.info(f"SambadDa edge from non vAkyasambanDa node {u.index} - {v.index}: {l}")
            return False
    # Conjunctions have to have one to and from edge
    for u in conj:
        if (conj[u]["from"] != 1) or (conj[u]["to"] != 1):
            logger.debug(f"Samyojaka violation for {u.index} {conj[u]}")
            return False
    return r
# Multigraph
def multiedgesets(p, G):
    """ Expand a parse p over multigraph G into all per-key edge combinations.

    Each edge (u, v) of p may correspond to several parallel edges
    (keys) in G; return one list of (u, v, key) triples per combination.
    """
    per_edge = [[(u, v, key) for key in G[u][v]] for (u, v) in p.edges]
    return [list(combo) for combo in product(*per_edge)]