#! /usr/bin/python
# -*- coding: utf-8 -*-
# $Id$
"""
Copyright (C) 2007, 2008 by Martin Thorsen Ranang
This file is part of InTeX.
InTeX is free software: you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free
Software Foundation, either version 3 of the License, or (at your
option) any later version.
InTeX is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
for more details.
You should have received a copy of the GNU General Public License
along with InTeX. If not, see <http://www.gnu.org/licenses/>.
"""
__author__ = "Martin Thorsen Ranang"
__revision__ = "$Rev$"
__version__ = "@VERSION@"
from collections import defaultdict
from cStringIO import StringIO
import logging
import re
import sys
from config import (
FIELD_SEPARATORS,
INTEX_DEFAULT_INDEX,
TOKEN_COMMENT,
TOKEN_ENTRY_META_INFO,
)
from acronym_entry import AcronymEntry
from concept_entry import ConceptEntry
from person_entry import PersonEntry
from stack import Stack
from utils import flatten, escape_aware_rsplit
import entry
class _IndexError(Exception):
"""Base class for errors in the index module.
"""
class IndexSyntaxError(_IndexError, SyntaxError):
"""Error caused by invalid syntax in the input .itx file.
"""
class Index(list):
    _intex_re = re.compile('\\\@writefile\{%s\}' \
                           '\{\\\indexentry\{(?P<key>.*)\}\{(?P<page>\w+)\}\}' \
                           % (INTEX_DEFAULT_INDEX))
    _aux_input_re = re.compile('\\\@input\{(?P<filename>.*\.aux)\}')
_concept_types = ['ACRONYMS', 'PEOPLE', 'CONCEPTS']
    _index_attributes = {
        'name': None,
    }
_re_macros = {
'FIELD': '''
[^%(COMMENT_TOKEN)s:\s] # Non-comment and non-meta.
( # Either
# escaped field separators
\\\\[%(FIELD_SEPARATORS)s%(META_TOKEN)s]
| # or
# non-field-separators
[^%(FIELD_SEPARATORS)s%(META_TOKEN)s]
)+ # repeated.
''',
'FIELD_SEPARATORS': FIELD_SEPARATORS,
'COMMENT_TOKEN': TOKEN_COMMENT,
'META_TOKEN': TOKEN_ENTRY_META_INFO,
'ALIAS_POINTER': '-->',
}
# Instantiate/format the FIELD macro value by supplying the other
# (i.e., FIELD_SEPARATORS, COMMENT_TOKEN, and META_TOKEN) mappings
# in _RE_MACROS.
_re_macros['FIELD'] = _re_macros['FIELD'] % _re_macros
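    # As an illustration (a hedged example; the concrete separator and
    # token values live in config.py), if ':' is among FIELD_SEPARATORS
    # and '%' is the comment token, a line such as
    #
    #     HMM: hidden Markov model
    #
    # contains two FIELD matches, 'HMM' and 'hidden Markov model',
    # while an escaped separator, as in 'A\: B', stays inside a single
    # field.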
    # A regular expression to match aliasing index entries.
    _alias_re = re.compile('''
    ^                               # Starts with
    (?P<entry>.+)                   # the entry (including whitespace),
    \s*?%(ALIAS_POINTER)s\s*        # the alias indicator,
    (?P<alias>.+)                   # the entry being aliased,
    $                               # at the end.
    ''' % _re_macros, re.VERBOSE)
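    # For example (assuming '-->' as ALIAS_POINTER, as defined in
    # _re_macros above), the line
    #
    #     WWW --> World Wide Web
    #
    # matches with entry = 'WWW ' (trailing whitespace included, since
    # the entry group is greedy) and alias = 'World Wide Web'.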
# Regular expressions, one pattern matcher for each context:
    _concept_re = re.compile('''
    ^                               # Starts with
    (?P<indent>\s+)?                # (possible) indentation,
    (?P<concept>%(FIELD)s)?         # an entry,
    [%(FIELD_SEPARATORS)s]*         # a separator,
    (?P<full_form>%(FIELD)s)?       # the full form,
    [%(FIELD_SEPARATORS)s]*        # another separator,
    (?P<meta>%(META_TOKEN)s.+)?     # meta information
    $                               # at the end.
    ''' % _re_macros, re.VERBOSE)
    _acronym_re = re.compile('''
    ^                               # Starts with
    (?P<indent>\s+)?                # (possible) indentation,
    (?P<acronym>%(FIELD)s)?         # an entry,
    [%(FIELD_SEPARATORS)s]*         # a separator,
    (?P<full_form>%(FIELD)s)?       # the full form,
    [%(FIELD_SEPARATORS)s]*        # another separator,
    (?P<typeset_as>%(FIELD)s)?      # how the entry is typeset,
    [%(FIELD_SEPARATORS)s]*        # another separator,
    (?P<meta>%(META_TOKEN)s.+)?     # meta information
    $                               # at the end.
    ''' % _re_macros, re.VERBOSE)
    _person_re = re.compile('''
    ^                               # Starts with
    (?P<indent>\s+)?                # (possible) indentation,
    (?P<person>%(FIELD)s)?          # an entry,
    [%(FIELD_SEPARATORS)s]*         # a separator,
    (?P<name>%(FIELD)s)?            # the name,
    [%(FIELD_SEPARATORS)s]*        # another separator,
    (?P<meta>%(META_TOKEN)s.+)?     # meta information
    $                               # at the end.
    ''' % _re_macros, re.VERBOSE)
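    # Illustrative entry lines for the three contexts above (hedged
    # examples; the exact field conventions are defined by the entry
    # classes):
    #
    #     *ACRONYMS* context:  HMM: hidden Markov model
    #     *CONCEPTS* context:  hidden Markov model
    #     *PEOPLE*   context:  Knuth: Donald E. Knuth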
# A map between context names and the match patterns:
_context_matcher = {
'ACRONYMS': _acronym_re,
'CONCEPTS': _concept_re,
'PEOPLE': _person_re,
}
_context_class = {
'ACRONYMS': AcronymEntry,
'CONCEPTS': ConceptEntry,
'PEOPLE': PersonEntry,
}
# Add a macro definition to the values available when constructing
# the regular expressions below.
_re_macros['CONCEPT_TYPES'] = '|'.join(_context_matcher.keys())
# A pattern to match meta directives:
    _meta_directive_re = re.compile('''
    ^                               # Starts with
    %(COMMENT_TOKEN)s               # the comment token,
    \s*                             # possibly some whitespace,
    (                               # then either
    (?P<attribute>\w+)\s*           # an attribute
    =                               # set to
    \s*(?P<value>\w+)               # some value,
    |                               # or
    (?P<context>\*(%(CONCEPT_TYPES)s)\*) # a context switch
    )                               # followed by
    \s*                             # possibly trailing whitespace
    $                               # at the end.
    ''' % _re_macros, re.VERBOSE)
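    # For example (a hedged illustration, assuming '%' is the comment
    # token), both of the following lines are meta directives:
    #
    #     % name = main        (set an index attribute)
    #     % *ACRONYMS*         (switch the parsing context)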
# A pattern to match comment-only lines.
_pure_comment_re = re.compile('''
^ # Starts with
\s* # possibly some whitespace
%(COMMENT_TOKEN)s # and the comment token,
.* # followed by anything.
''' % _re_macros, re.VERBOSE)
# A list used to instantiate the context dependent parser.
_matchers = [
_meta_directive_re, # For matching meta directives.
_pure_comment_re, # For matching pure comments.
None, # The current context will be placed here.
]
def __init__(self, filename=None, index_name='default'):
"""The constructor.
"""
list.__init__(self) # Initialize the base class.
self._name = index_name # FIXME: The index's name for use in/by ...
self._current_matchers = list(Index._matchers)
self._indentation_level = {
'': 0,
}
self._elements = Stack() # A stack of elements used when parsing.
#self.__entries = [] # A list of all the entries in the index.
# Accessors for the 'name' property (_-prefixed to force access
# through the property):
def _get_name(self):
return self._name
def _set_name(self, name):
self._name = name
name = property(_get_name, _set_name, None, 'The name of the index.')
def handle_meta_directive(self, attribute=None, value=None, context=None,
alias=None):
if attribute:
# Set an attribute describing this index (e.g., its name).
logging.info('Setting the index\'s %s=%s.', attribute, repr(value))
setattr(self, attribute, value)
elif context:
self._context = context[1:-1] # Remove pre and post '*'s.
logging.info('Switching context to: "%s"', self._context, )
            self._current_matchers[-1] = self._context_matcher[self._context]
self._entry_class = self._context_class[self._context]
def handle_comment(self, alias=None):
"""Do nothing. (Yes, seriously.)
"""
pass
def _get_indentation_level(self, indent):
        if indent is None:
            indent = ''
        # Make sure that tabs in the indentation are expanded.
        indent = indent.expandtabs()
if indent not in self._indentation_level:
if len(indent) < max(len(key) for key in self._indentation_level):
# The indentation levels should be monotonically
# increasing. The first time an indentation level is
# used, it is also defined and available for the rest
# of the session.
                raise IndentationError('On line %d in file "%s":\n%s' \
                                       % ((self._line_num + 1),
                                          self._filename,
                                          self._current_line))
else:
self._indentation_level[indent] = len(self._indentation_level)
return self._indentation_level[indent]
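        # For example (a hedged illustration): if entries appear
        # indented by '', '  ', and '    ', in that order, they are
        # assigned levels 0, 1, and 2; a later two-space indent reuses
        # level 1, while a previously unseen, shorter indent (say, a
        # single space) raises an IndentationError.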
def _get_current_parent(self):
if self._elements:
return self._elements[-1].identity
else:
            return None
def handle_entry(self, indent=None, **rest):
indent_level = self._get_indentation_level(indent)
# Reduce the _ELEMENTS stack until its length equals the
# current INDENT_LEVEL.
while indent_level < len(self._elements):
self._elements.pop()
# Push the new entry onto the stack. The current _ENTRY_CLASS
# is set by the different meta directives (see
# handle_meta_directive()).
self._elements.push(self._entry_class(index=self,
parent=self._get_current_parent(),
indent_level=indent_level,
**rest))
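        # Illustration (hedged): entries arriving at indentation
        # levels 0, 1, 1 produce a hierarchy in which both level-1
        # entries have the level-0 entry as their parent; pushing the
        # second level-1 entry first pops its sibling off the stack.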
_match_handler = {
_meta_directive_re: handle_meta_directive,
_pure_comment_re: handle_comment,
_concept_re: handle_entry,
_acronym_re: handle_entry,
_person_re: handle_entry,
}
@staticmethod
def syntax_error(filename, line_number, message):
logging.error('Syntax error on line %d in file "%s": %s',
line_number, filename, message)
sys.exit(1)
@classmethod
def from_file(cls, filename):
self = cls()
self._filename = filename
logging.info('Reading index definitions from "%s".', filename)
stream = open(filename, 'r')
for self._line_num, line in enumerate(stream):
# Ignore whitespace-only lines.
if line.isspace():
continue
# Keep a copy of the original line (for error messages,
# etc.)
self._current_line = line
# Remove trailing white-space.
line = line.rstrip()
# Find out whether the entry is an alias for another
# entry.
for match in self._alias_re.finditer(line):
line, alias = map(match.group, ['entry', 'alias'])
break
else:
alias = None
# Parse the line trying different matchers. Quit trying
# after the first applicable matcher is used.
            for matcher in self._current_matchers:
if matcher is None:
                    self.syntax_error(filename, (self._line_num + 1),
                                      "Encountered an entry, but no " \
                                      "current entry type " \
                                      "('*ACRONYMS*', '*CONCEPTS*', or " \
                                      "'*PEOPLE*') has been defined.")
for match in matcher.finditer(line):
# Call the appropriate handler, given the current
# context.
try:
self._match_handler[matcher](self, alias=alias,
**match.groupdict())
except entry.MissingAcronymExpansionError, e:
self.syntax_error(filename, (self._line_num + 1),
'Missing full-form expansion for ' \
'acronym definition of "%s".' \
% (e.message, ))
break # To avoid the else clause.
else:
continue # The matcher didn't apply, so
# let's try the next one.
break # Skip the remaining matchers.
stream.close() # Explicitly close the input stream.
return self
def __str__(self):
stream = StringIO()
previous_type = None
for entry in self:
if type(entry) != previous_type:
print >> stream, entry.get_plain_header()
print >> stream, entry
previous_type = type(entry)
string = stream.getvalue()
stream.close()
return string
def generate_reference_index(self):
references = defaultdict(set)
for entry in self:
for inflection, phrase in entry.reference_short.iteritems():
if phrase:
references[phrase].add(entry)
ambiguous_short_references = [
entries
for phrase, entries in sorted(references.iteritems())
if len(entries) > 1]
for entries in ambiguous_short_references:
            logging.warn('The short-form references for the entries\n'
                         '%s\nare identical (%s) and therefore ambiguous. '
                         'These entries must be referred to by their '
                         'full-form references.',
                         '\nand\n'.join(str(entry) for entry in entries),
                         list(entries)[0].reference_short['singular'])
ambiguous_short_references = set(flatten(ambiguous_short_references))
references = dict()
for entry in self:
for inflection, phrase in entry.reference.iteritems():
if phrase in references:
logging.error('Duplicate full-form reference "%s" ' \
'detected. Please correct the file "%s".', \
phrase, self._filename)
sys.exit(1)
references[phrase] = (entry, inflection)
if entry not in ambiguous_short_references:
for inflection, phrase in entry.reference_short.iteritems():
references[phrase] = (entry, inflection)
use_short_reference = True
else:
use_short_reference = False
entry.use_short_reference = use_short_reference
self.references = references
def get_auxiliary_entries(self, filename):
for i, line in enumerate(open(filename, 'r')):
line_number = (i + 1)
            # Handle recursive \@input statements within .aux files.
match = self._aux_input_re.match(line.rstrip())
if match:
input_filename = match.group('filename')
                logging.info('Will include "%s"', input_filename)
for result in self.get_auxiliary_entries(input_filename):
yield result
continue
match = self._intex_re.match(line.rstrip())
if not match:
yield (filename, line_number, False, line)
continue
key, page = match.groups()
# Keep track of any page-number typesetting hints.
parts = escape_aware_rsplit(key, '|', 1)
if len(parts) > 1:
key, typeset_page_number = parts
typeset_page_number = '|' + typeset_page_number
else:
typeset_page_number = ''
            # Do _not_ try to convert the page into an integer; it
            # may occur as Roman numerals.
yield (filename, line_number, True,
(key, page, typeset_page_number))
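        # For example (a hedged sketch of the expected .aux contents),
        # a line such as
        #
        #     \@writefile{...}{\indexentry{parse tree|textbf}{42}}
        #
        # yields key = 'parse tree', page = '42', and
        # typeset_page_number = '|textbf'.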
def interpret_auxiliary(self, auxiliary_filename, internal_file,
index_file):
not_found = set()
already_output = set()
already_handled = set()
for filename, line_number, is_concept, data \
in self.get_auxiliary_entries(auxiliary_filename):
if not is_concept:
logging.debug('Ignoring non-concept: "%s" on line %d in "%s".',
data.rstrip(), line_number, filename)
continue
key, page, typeset_page_number = data
logging.debug('Handling reference "%s" on page %s.', key, page)
if key not in self.references:
if key.lower() in self.references:
key = key.lower()
logging.debug('lowered key: %s', key)
else:
not_found.add((key, page))
continue
concept, inflection = self.references[key]
logging.debug('Reference expanded to %s (inflection=%s)',
repr(concept), inflection)
for line in concept.generate_index_entries(page,
typeset_page_number):
if line not in already_output:
print >> index_file, line
already_output.add(line)
if key not in already_handled:
for line in concept.generate_internal_macros(inflection):
if line not in already_output:
print >> internal_file, line
already_output.add(line)
already_handled.add(key)
return not_found
def main():
"""Module mainline (for standalone execution).
"""
pass
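    # A minimal driver, sketched here as a hedged example rather than
    # the project's official command-line interface: parse the .itx
    # file named on the command line and print the resulting index.
    if len(sys.argv) > 1:
        index = Index.from_file(sys.argv[1])
        print index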
if __name__ == "__main__":
main()