# -*- coding: utf-8 -*-
"""
@author: Karthik Madathil (github: @kmadathil)
"""
from os.path import dirname, basename, splitext, join
from argparse import ArgumentParser
import logging
from sanskrit_parser import Parser
from sanskrit_parser.base.sanskrit_base import SCHEMES, SanskritNormalizedString
from sanskrit_parser.base.sanskrit_base import outputctx
from sanskrit_parser.parser.sandhi_analyzer import LexicalSandhiAnalyzer
from sanskrit_parser import enable_file_logger, enable_console_logger
import csv
# Module-level logger, named after this module
logger = logging.getLogger(__name__)
def getVakyaArgs(argv=None):
    """
    Build the argument parser for the vakya analyzer and parse argv with it.

    argv: list of argument strings; None makes argparse read sys.argv[1:]
    Returns the argparse Namespace holding the parsed options
    """
    # One (names, keyword settings) entry per option, in registration order
    option_table = (
        # Input string (positional, optional)
        (('data',),
         dict(nargs="?", type=str, default="astyuttarasyAMdishidevatAtmA")),
        # Input encoding (None means autodetect)
        (('--input-encoding',), dict(type=str, default=None)),
        # Need a lakara
        (('--need-lakara',), dict(action='store_true')),
        (('--max-paths',), dict(type=int, default=1)),
        (('--split-above',), dict(type=int, default=5)),
        (('--lexical-lookup',), dict(type=str, default="combined")),
        (('--pre-segmented',),
         dict(action='store_true',
              help="Expect pre-segmented space separated string (Usually for test only)")),
        (('--strict-io',),
         dict(action='store_true', default=False,
              help="Do not modify the input/output string to match conventions")),
        (('--score',),
         dict(dest="score", action='store_true',
              help="Use the lexical scorer to score the splits and reorder them")),
        # Stored inverted: giving --slow-merge sets fast_merge to False
        (('--slow-merge',),
         dict(dest='fast_merge', action='store_false',
              help="Development Only: use if you see issues in divide and conquer")),
        (('--dot-file',), dict(type=str, default=None, help='Dotfile')),
        (('--conll',), dict(action="store_true", help="display CONLL")),
        (('--conll-file',),
         dict(type=str, default=None, help='CONLL output file')),
        (('--conll-append',),
         dict(action="store_true",
              help="append to CONLL file rather than recreate")),
        (('--min-cost',),
         dict(action="store_true", help="Only return min-cost parses")),
    )

    vakya_parser = ArgumentParser(description='Vakya Analyzer')
    for names, settings in option_table:
        vakya_parser.add_argument(*names, **settings)
    return vakya_parser.parse_args(argv)
def _indexed_path(path, tag):
    """
    Return path with tag inserted between the file name stem and its
    extension, keeping the directory part unchanged.
    """
    stem, ext = splitext(basename(path))
    return join(dirname(path), stem + tag + ext)


def vakya(argv=None):
    """
    Command line driver for the vakya analyzer.

    Parses argv with getVakyaArgs, splits the input string with Parser.split,
    then enumerates the parses of every split. Each parse is printed (as CONLL
    lines when --conll is given) and, when --conll-file is given, also written
    to a tab separated file whose name carries the split and parse indices.
    When --dot-file is given, split.write_dot is called once per split with a
    name carrying the split index.

    argv: list of argument strings; None makes argparse read sys.argv[1:]
    Returns None
    """
    args = getVakyaArgs(argv)
    # NOTE(review): args.need_lakara and args.fast_merge are parsed by
    # getVakyaArgs but are not consumed anywhere in this function
    if args.strict_io:
        print("Interpreting input strictly")
    else:
        print("Interpreting input loosely (strict_io set to false)")
    logger.info(f"Input String: {args.data}")
    parser = Parser(input_encoding=args.input_encoding,
                    strict_io=args.strict_io,
                    output_encoding="SLP1",
                    replace_ending_visarga=None,
                    score=args.score,
                    split_above=args.split_above,
                    lexical_lookup=args.lexical_lookup)
    logger.debug('Splits:')
    # Guard against a None result so that "no splits" does not crash enumerate
    # NOTE(review): presumably split() returns None when nothing is found - confirm
    splits = parser.split(args.data,
                          limit=args.max_paths,
                          dot_file=args.dot_file,
                          pre_segmented=args.pre_segmented) or ()
    want_conll = args.conll or args.conll_file is not None
    for si, split in enumerate(splits):
        logger.info(f'Sandhi Split: {split}')
        logger.info(f'Min cost only {args.min_cost}')
        for pi, parse in enumerate(split.parse(limit=999,
                                               min_cost_only=args.min_cost)):
            logger.debug(f'Parse {pi}')
            logger.debug(f'{parse}')
            print(f'Parse {pi} : (Cost = {parse.cost})')
            # Build the CONLL rows once; they feed both the console and the file
            conll_lines = list(parse.to_conll()) if want_conll else []
            if args.conll:
                for line in conll_lines:
                    print(line)
            else:
                print(f'{parse}')
            if args.conll_file is not None:
                conllbase = _indexed_path(args.conll_file,
                                          f"_split{si}_parse{pi}")
                mode = "a" if args.conll_append else "w"
                # newline='' is what the csv module expects of its file object;
                # the with block closes the file even if writing fails
                with open(conllbase, mode, newline='') as tfile:
                    twriter = csv.writer(tfile, delimiter='\t')
                    twriter.writerows(conll_lines)
                    # Empty row terminates this parse's record
                    twriter.writerow([])
        # Write dot files: one per split, after its parses have been enumerated
        if args.dot_file is not None:
            split.write_dot(_indexed_path(args.dot_file, f"_split{si}"))
    return None
def getSandhiArgs(argv=None):
    """
    Argparse routine for the sandhi analyzer.

    argv: list of argument strings; None makes argparse read sys.argv[1:]
    Returns args variable (argparse Namespace)
    """
    # Parser Setup
    parser = ArgumentParser(description='Sandhi Analyzer')
    # Input string (positional, optional)
    parser.add_argument('data', nargs="?", type=str, default="astyuttarasyAMdishidevatAtmA")
    # Input Encoding (autodetect by default)
    parser.add_argument('--input-encoding', type=str, default=None)
    parser.add_argument('--max-paths', type=int, default=10)
    parser.add_argument('--lexical-lookup', type=str, default="combined")
    parser.add_argument('--pre-segmented', action='store_true',
                        help="Expect pre-segmented space separated string (Usually for test only)")
    parser.add_argument('--strict-io', action='store_true',
                        help="Do not modify the input/output string to match conventions", default=False)
    # Scoring is on by default here; the flag turns it off, so the help text
    # must describe the negative
    parser.add_argument('--no-score', dest="score", action='store_false',
                        help="Do not use the lexical scorer to score the splits and reorder them")
    parser.add_argument('--dot-file', type=str, default=None, help='Dotfile')
    return parser.parse_args(argv)
def sandhi(argv=None):
    """
    Command line driver for the sandhi analyzer.

    Parses argv with getSandhiArgs, asks Parser.split for the splits of the
    input string and logs each one at INFO level.

    argv: list of argument strings; None makes argparse read sys.argv[1:]
    Returns None
    """
    opts = getSandhiArgs(argv)
    print("Interpreting input strictly" if opts.strict_io
          else "Interpreting input loosely (strict_io set to false)")
    logger.info(f"Input String: {opts.data}")

    splitter = Parser(input_encoding=opts.input_encoding,
                      strict_io=opts.strict_io,
                      output_encoding="SLP1",
                      replace_ending_visarga=None,
                      score=opts.score,
                      lexical_lookup=opts.lexical_lookup)
    logger.debug('Splits:')
    candidates = splitter.split(opts.data,
                                limit=opts.max_paths,
                                dot_file=opts.dot_file,
                                pre_segmented=opts.pre_segmented)
    # The split index is not needed, only the splits themselves
    for candidate in candidates:
        logger.info(f'Split: {candidate}')
def cmd_line(argv=None):
    """ Command Line Wrapper Function

    Reads the subcommand name and the optional --debug flag, sets up logging,
    then hands the remaining arguments to the module-level function with the
    same name as the subcommand.

    argv: list of argument strings; None makes argparse read sys.argv[1:]
    Exits with status 1 if no function exists for the chosen subcommand
    """
    # add_help=False lets --help fall through to the subcommand's own parser
    parser = ArgumentParser(description='Sanskrit Parser',
                            usage='%(prog)s [sandhi|vakya|tags] [options] \n\n Use %(prog)s [sandhi|vakya|tags] --help for further options',
                            add_help=False)
    parser.add_argument('command', help='Subcommand to run',
                        choices=["sandhi", "vakya", "tags"])
    parser.add_argument('--debug', action='store_true')
    # Only the subcommand and --debug are validated here; everything else is
    # returned untouched in rest for the subcommand's parser
    args, rest = parser.parse_known_args(argv)
    # Logging: console always at INFO, log file only with --debug
    enable_console_logger(level=logging.INFO)
    if args.debug:
        enable_file_logger(level=logging.DEBUG)
    # Dispatch to the module-level function named after the subcommand.
    # argparse has already restricted the name to the declared choices, so
    # this is a plain lookup rather than evaluating a built string.
    handler = globals().get(args.command)
    if not callable(handler):
        print('Unrecognized command')
        parser.print_help()
        raise SystemExit(1)
    handler(rest)