#! /usr/bin/python
# -*- coding: latin-1 -*-
# $Id$
"""
Copyright (C) 2007, 2008 by Martin Thorsen Ranang
This file is part of InTeX.
InTeX is free software: you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free
Software Foundation, either version 3 of the License, or (at your
option) any later version.
InTeX is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
for more details.
You should have received a copy of the GNU General Public License
along with InTeX. If not, see .
"""
__revision__ = "$Rev$"
__author__ = "Martin Thorsen Ranang "
import re
def cartesian(*sequences):
"""Returns the cartesian product of the SEQUENCES.
"""
if len(sequences) == 0:
yield []
else:
head, tail = sequences[:-1], sequences[-1]
tail = list(tail)
for x in cartesian(*head):
for y in tail:
yield x + [y]
class UnexpectedClosingError(ValueError):
"""Error indicating that an ParenParser detected an unexpected
closing.
"""
pass
class ParenParser(object):
ESCAPE_TOKEN = '\\'
__default_paren_pair = {
'(': ')',
'{': '}',
'[': ']',
}
def __init__(self, paren_pair=__default_paren_pair):
self.__paren_pair = paren_pair
self.__closings = set(self.__paren_pair.values())
def __reset(self):
self.__consecutive_escapes = 0 # The number of consecutive escapes.
self.__opened_scopes = [] # A stack of opened scopes.
self.__expected_closing = None
def __update_expected_closing(self):
if self.__opened_scopes:
most_recent_opening = self.__opened_scopes[-1][0]
self.__expected_closing = self.__paren_pair[most_recent_opening]
else:
self.__expected_closing = None
def get_scope_spans(self, string, start_index=0):
self.__reset()
for i in xrange(start_index, len(string)):
token = string[i]
if token == self.ESCAPE_TOKEN:
self.__consecutive_escapes += 1
continue
# If the current TOKEN is not escaped; that is, the
# __CONSECUTIVE_ESCAPES is zero or odd (bit-wise "& 1")...
if not (self.__consecutive_escapes & 1):
# If the TOKEN is an expected closing...
if token == self.__expected_closing:
# Yield the start and end index for the closing
# scope.
yield self.__opened_scopes.pop()[1], (i + 1)
self.__update_expected_closing()
# If TOKEN is an unexpected closing...
elif token in self.__closings:
j = self.__opened_scopes[-1][1]
reason = "Received unexpected closing '%s' at " \
"input position %d. " \
"Input (from position %d):\n%s\n" \
"Expected closing was '%s'." \
% (token, i, j, string[j:(i + 1)],
self.__expected_closing)
raise UnexpectedClosingError, reason
# If TOKEN is a legal opening token...
elif (token in self.__paren_pair):
self.__opened_scopes.append((token, i))
self.__update_expected_closing()
# Reset the escape-token counter.
self.__consecutive_escapes = 0
@staticmethod
def filter_outer_scopes(scopes):
"""Removes scopes that appear inside other scopes. Hence, only
the outermost scopes will be returned.
Example:
>>> parenparser.remove_inner_scopes([(2, 5), (3, 4), (6, 12),
... (7, 11), (8, 10)])
[(2, 5), (6, 12)]
"""
scopes = sorted(scopes)
keep = []
for n, (i, j) in enumerate(scopes):
if n == 0:
pass # Must define an outer scope.
elif j < keep[-1][-1]:
continue
keep.append((i, j))
return keep
def split(self, string, separator=None, maxsplit=None):
"""Return a list of the words in the string STRING, using
SEPARATOR as the delimiter string, but scopes defined by
parenthetical tokens are not split. If MAXSPLIT is given, at
most MAXSPLIT splits are done. If SEPARATOR is not specified
or is None, any whitespace string is a separator.
Please note: The MAXSPLIT argument is not honored yet.
"""
# Make sure only outermost scopes are considered.
indices = self.filter_outer_scopes(self.get_scope_spans(string))
# If no parenthetical scopes were detected, there are no
# scopes to honor.
if not indices:
return string.split(separator)
# The following is added to avoid any special-case handling
# after the for-loop below.
if indices[-1][-1] < len(string):
indices.append((-1, None))
k = None
result = []
for i, j in indices:
parts = filter(''.__ne__, string[k:i].split(separator)) \
+ [string[i:j]]
first_part = parts[0]
if result and (k is not None) and (first_part[0] == string[k]):
result[-1] += first_part
else:
result.append(first_part)
if len(parts) > 1:
result.extend(parts[1:-1])
last_part = parts[-1]
if result[-1][-1] == string[i - 1]:
result[-1] += last_part
else:
result.append(last_part)
k = j
return result
def strip(self, string, valid_left_sides=''.join(__default_paren_pair)):
if (len(string) > 1) \
and (string[0] in valid_left_sides) \
and (string[-1] in self.__default_paren_pair[string[0]]):
return string[1:-1]
else:
return string
def main():
"""Module mainline (for standalone execution).
"""
p = ParenParser()
for string in [
'',
'a b c',
'()',
'(abc)',
'(abc) def',
'abc(def)ghi (jkl) mno(pqr) stu',
'(abc(def)ghi (jkl) mno(pqr) stu)',
'()abc(def)ghi (jkl) mno(pqr) stu',
'abc(def)ghi (jkl) mno(pqr) stu()',
'abc(def)ghi (jkl) mno(pqr) stu (vwx) ()',
]:
print '("%s",\n %s),' % (string, p.split(string))
if __name__ == "__main__":
main()