# Source code for pdbx.reader

##
# File: PdbxReader.py
# Date: 2012-01-09 Jdw Adapted from PdbxParser
#
# Updates:
#
# 2012-01-09 - (jdw) Separate reader and writer classes.
#
# 2012-09-02 - (jdw) Revise tokenizer to better handle embedded quoting.
#
##
"""PDBx/mmCIF dictionary and data file parser.

.. note::

   Acknowledgements:

   The tokenizer used in this module is modeled after the clever parser
   design used in the PyMMLIB package.

   PyMMLib Development Group:

   Authors: Ethan Merritt: merritt@u.washington.edu, Jay Painter: jay.painter@gmail.com

   See: http://pymmlib.sourceforge.net/
"""
import re
from .containers import DataCategory, DefinitionContainer, DataContainer
from .errors import PdbxSyntaxError


class PdbxReader:
    """PDBx reader for data files and dictionaries.

    The reader is constructed around an iterable of text lines (typically an
    open file handle) and :meth:`read` appends one container per ``data_``
    block, ``save_`` frame or ``global_`` block to a caller-supplied list.
    """

    def __init__(self, input_file):
        """Initialize.

        :param file input_file: input file handle; e.g. as returned by open().
        """
        # Number of input lines consumed so far; reported in syntax errors.
        self.__current_line_number = 0
        self.__input_file = input_file
        # Maps the lower-cased text before the first "_" of a reserved word
        # to the parser state that handles it.
        self.__state_dict = {
            "data": "ST_DATA_CONTAINER",
            "loop": "ST_TABLE",
            "global": "ST_GLOBAL_CONTAINER",
            "save": "ST_DEFINITION",
            "stop": "ST_STOP",
        }

    def read(self, container_list):
        """Appends to the input list of definition and data containers.

        :param list container_list:  list of :class:`~pdbx.containers.ContainerBase` containers to append to.
        :raises pdbx.errors.PdbxSyntaxError:  if the input is malformed or
          ends where the parser still requires another token
        """
        self.__current_line_number = 0
        try:
            self.__parser(self.__tokenizer(self.__input_file), container_list)
        except StopIteration:
            # The parser calls next() on the token stream at points where a
            # token is mandatory; running dry there means a truncated file.
            self.__syntax_error("Unexpected end of file")

    def __syntax_error(self, error_text):
        """Raise a PdbxSyntaxError.

        This method never returns normally.

        :param str error_text:  text for exception message
        :raises pdbx.errors.PdbxSyntaxError:  exception with error text
        """
        raise PdbxSyntaxError(self.__current_line_number, error_text)

    @staticmethod
    def __get_container_name(in_word) -> str:
        """Returns the name of the data_ or save_ container.

        :param str in_word:  input word
        :returns:  the text after the 5-character ``data_`` / ``save_``
          prefix with surrounding whitespace removed (may be empty)
        """
        # Both "data_" and "save_" are exactly five characters long.
        return str(in_word[5:]).strip()

    def __get_state(self, in_word) -> tuple:
        """Identifies reserved syntax elements and assigns an associated state.

        :param str in_word:  input word
        :returns: (reserved word, state) where:

          * reserved word - is one of CIF syntax elements: data_, loop_,
            global_, save_, stop_ (returned lower-cased and without the
            underscore), or None if the word is not reserved

          * state - the parser state required to process this next section,
            or "ST_UNKNOWN" if the word is not reserved.
        """
        i = in_word.find("_")
        if i == -1:
            return None, "ST_UNKNOWN"
        try:
            # Reserved words are matched case-insensitively.
            reserved_word = in_word[:i].lower()
            return reserved_word, self.__state_dict[reserved_word]
        except KeyError:
            return None, "ST_UNKNOWN"

    def __parser(self, tokenizer, container_list):
        """Parser for PDBx data files and dictionaries.

        Tokens are 4-tuples of (category name, attribute name, quoted string,
        unquoted word).  Each pass of the main loop classifies the current
        token, handles the construct it starts, and leaves the next
        unprocessed token in the ``current_*`` variables.

        :param tokenizer: reentrant method recognizing data item names
          (_category.attribute), quoted strings (single, double and
          multi-line semi-colon delimited), and unquoted strings.
        :param list container_list: list-type container for data and
          definition objects parsed from the input file.
          container_list is appended with data and definition objects.
        :raises pdbx.errors.PdbxSyntaxError:  on malformed input
        :raises StopIteration:  if the token stream ends where a token is
          required (translated into a syntax error by :meth:`read`)
        """
        # Working container - data or definition
        current_container = None
        # Working category container
        category_index = {}
        current_category = None
        current_row = None
        state = None

        # Find the first reserved word and begin capturing data.
        for (
            current_category_name,
            current_attribute_name,
            current_quoted_string,
            current_word,
        ) in tokenizer:
            if current_word is None:
                continue
            reserved_word, state = self.__get_state(current_word)
            if reserved_word is not None:
                break
        else:
            # empty file
            return

        while True:
            # Set the current state: at this point in the processing cycle we
            # are expecting a token containing either a '_category.attribute'
            # or a reserved word.
            if current_category_name is not None:
                state = "ST_KEY_VALUE_PAIR"
            elif current_word is not None:
                reserved_word, state = self.__get_state(current_word)
            else:
                self.__syntax_error("Miscellaneous syntax error")
                return

            # Process _category.attribute value assignments
            if state == "ST_KEY_VALUE_PAIR":
                try:
                    current_category = category_index[current_category_name]
                except KeyError:
                    # A new category is encountered - create a container and
                    # add a row
                    category_index[current_category_name] = DataCategory(
                        current_category_name
                    )
                    current_category = category_index[current_category_name]
                    try:
                        current_container.append(current_category)
                    except AttributeError:
                        # current_container is still None: no data_/save_
                        # block has been opened yet.
                        self.__syntax_error(
                            "Category cannot be added to data_ block"
                        )
                        return
                    current_row = []
                    current_category.append(current_row)
                else:
                    # Recover the existing row from the category
                    try:
                        current_row = current_category[0]
                    except IndexError:
                        self.__syntax_error(
                            "Internal index error accessing category data"
                        )
                        return
                # Check for duplicate attributes and add attribute to table.
                if current_attribute_name in current_category.attribute_list:
                    self.__syntax_error(
                        "Duplicate attribute encountered in category"
                    )
                    return
                current_category.append_attribute(current_attribute_name)
                # Get the data for this attribute from the next token.  A
                # StopIteration here is deliberate: the value is mandatory.
                tok_category, _, current_quoted_string, current_word = next(
                    tokenizer
                )
                if tok_category is not None or (
                    current_quoted_string is None and current_word is None
                ):
                    self.__syntax_error(
                        "Missing data for item _%s.%s"
                        % (current_category_name, current_attribute_name)
                    )
                # Unquoted "?" is stored as None and unquoted "." as an
                # empty string; quoted "?" / "." are kept verbatim.
                if current_word == "?":
                    current_row.append(None)
                elif current_word == ".":
                    current_row.append("")
                elif current_word is not None:
                    # Validation check token for misplaced reserved words
                    reserved_word, state = self.__get_state(current_word)
                    if reserved_word is not None:
                        self.__syntax_error(
                            "Unexpected reserved word: %s" % (reserved_word)
                        )
                    current_row.append(current_word)
                elif current_quoted_string is not None:
                    current_row.append(current_quoted_string)
                else:
                    self.__syntax_error("Missing value in item-value pair")
                try:
                    (
                        current_category_name,
                        current_attribute_name,
                        current_quoted_string,
                        current_word,
                    ) = next(tokenizer)
                except StopIteration:
                    # Clean end of input after a complete item-value pair.
                    return
                continue

            # Process a loop_ declaration and associated data
            if state == "ST_TABLE":
                # The category name in the next current_category_name,
                # current_attribute_name pair defines the name of the category
                # container.
                (
                    current_category_name,
                    current_attribute_name,
                    current_quoted_string,
                    current_word,
                ) = next(tokenizer)
                if (current_category_name is None) or (
                    current_attribute_name is None
                ):
                    self.__syntax_error(
                        "Unexpected token in loop_ declaration"
                    )
                    return
                # Check for a previous category declaration.
                if current_category_name in category_index:
                    self.__syntax_error(
                        "Duplicate category declaration in loop_"
                    )
                    return
                current_category = DataCategory(current_category_name)
                try:
                    current_container.append(current_category)
                except AttributeError:
                    self.__syntax_error(
                        "loop_ declaration outside of data_ block or save_ "
                        "frame"
                    )
                    return
                current_category.append_attribute(current_attribute_name)
                # Read the rest of the loop_ declaration
                for (
                    current_category_name,
                    current_attribute_name,
                    current_quoted_string,
                    current_word,
                ) in tokenizer:
                    if current_category_name is None:
                        # First non-item token: start of the loop data.
                        break
                    if current_category_name != current_category.name:
                        self.__syntax_error(
                            "Changed category name in loop_ declaration"
                        )
                        return
                    current_category.append_attribute(current_attribute_name)
                else:
                    # formal CIF 1.1 grammar expects at least one value
                    self.__syntax_error("loop_ without values")
                # If the next token is a 'word', check it for any reserved
                # words
                if current_word is not None:
                    reserved_word, state = self.__get_state(current_word)
                    if reserved_word is not None:
                        if reserved_word == "stop":
                            return
                        else:
                            self.__syntax_error(
                                "Unexpected reserved word after loop "
                                "declaration: %s" % (reserved_word)
                            )
                # Read the table of data for this loop_
                while True:
                    current_row = []
                    current_category.append(current_row)
                    # Consume one token per declared attribute to fill a row.
                    for _ in current_category.attribute_list:
                        if current_word == "?":
                            current_row.append(None)
                        elif current_word == ".":
                            current_row.append("")
                        elif current_word is not None:
                            current_row.append(current_word)
                        elif current_quoted_string is not None:
                            current_row.append(current_quoted_string)
                        try:
                            (
                                current_category_name,
                                current_attribute_name,
                                current_quoted_string,
                                current_word,
                            ) = next(tokenizer)
                        except StopIteration:
                            return
                    # loop_ data processing ends if a new _category.attribute
                    # is encountered
                    if current_category_name is not None:
                        break
                    # A reserved word is encountered
                    if current_word is not None:
                        reserved_word, state = self.__get_state(current_word)
                        if reserved_word is not None:
                            break
                continue

            if state == "ST_DEFINITION":
                # Ignore trailing unnamed saveframe delimiters e.g. 'save_'
                state_name = self.__get_container_name(current_word)
                if state_name:
                    current_container = DefinitionContainer(state_name)
                    container_list.append(current_container)
                    category_index = {}
                    current_category = None
            elif state == "ST_DATA_CONTAINER":
                data_name = self.__get_container_name(current_word)
                if not data_name:
                    data_name = "unidentified"
                current_container = DataContainer(data_name)
                container_list.append(current_container)
                category_index = {}
                current_category = None
            elif state == "ST_STOP":
                return
            elif state in ("ST_GLOBAL", "ST_GLOBAL_CONTAINER"):
                # The state table names this state "ST_GLOBAL_CONTAINER";
                # testing only for "ST_GLOBAL" sent every global_ block to
                # the unhandled-state failure below.
                current_container = DataContainer("blank-global")
                current_container.set_global()
                container_list.append(current_container)
                category_index = {}
                current_category = None
            elif state == "ST_UNKNOWN":
                self.__syntax_error(
                    "Unrecognized syntax element: " + str(current_word)
                )
                return
            else:
                # Explicit raise rather than `assert False`, which is
                # stripped when Python runs with -O.
                raise AssertionError(f"unhandled state {state}")

            try:
                (
                    current_category_name,
                    current_attribute_name,
                    current_quoted_string,
                    current_word,
                ) = next(tokenizer)
            except StopIteration:
                return

    def __tokenizer(self, input_file):
        """Tokenizer method for the mmCIF syntax file.

        Each return/yield from this method returns information about the next
        token in the form of a tuple with the following structure:
            (category name, attribute name, quoted strings, words w/o quotes
            or white space)
        Single- and double-quoted strings are matched by separate regular
        expression alternatives to better handle embedded quotes.

        :param file input_file:  file object ready for reading
        :rtype: Iterator[tuple]
        :raises pdbx.errors.PdbxSyntaxError:  if a semi-colon delimited
          multi-line string is not terminated before the end of input
        """
        # Regex definition for mmCIF syntax - semi-colon delimited strings are
        # handled outside of this regex.
        mmcif_re = re.compile(
            r"(?:"
            # _category.attribute
            r"(?:_(.+?)[.](\S+))"
            "|"
            # single quoted strings
            r"(?:['](.*?)(?:[']\s|[']$))"
            "|"
            # double quoted strings
            r"(?:[\"](.*?)(?:[\"]\s|[\"]$))"
            "|"
            # comments (dumped)
            r"(?:\s*#.*$)"
            "|"
            # unquoted words
            r"(\S+)"
            r")"
        )
        file_iterator = iter(input_file)
        # Tokenizer loop begins here
        for line in file_iterator:
            self.__current_line_number += 1
            # Dump comments
            if line.startswith("#"):
                continue
            # Gobble up the entire semi-colon/multi-line-delimited string and
            # stuff this into the string slot in the return tuple
            if line.startswith(";"):
                multiline_string = [line[1:]]
                for line in file_iterator:
                    self.__current_line_number += 1
                    if line.startswith(";"):
                        break
                    multiline_string.append(line)
                else:
                    self.__syntax_error("unterminated multi-line string")
                # remove trailing new-line that is part of the \n; delimiter
                multiline_string[-1] = multiline_string[-1].rstrip()
                yield (None, None, "".join(multiline_string), None)
                # Need to process the remainder of the current line -
                line = line[1:]

            # Apply regex to the current line and consolidate the
            # single/double quoted groups into the one quoted string slot
            for match in mmcif_re.finditer(line):
                match_groups = match.groups()
                # A comment match captures nothing; skip it.
                if match_groups != (None, None, None, None, None):
                    if match_groups[2] is not None:
                        quoted_string = match_groups[2]
                    elif match_groups[3] is not None:
                        quoted_string = match_groups[3]
                    else:
                        quoted_string = None
                    groups = (
                        match_groups[0],
                        match_groups[1],
                        quoted_string,
                        match_groups[4],
                    )
                    yield groups

    def __tokenizer_org(self, input_file):
        """Earlier tokenizer for the mmCIF syntax file.

        Differs from :meth:`__tokenizer` only in using a single regular
        expression alternative for both quote characters; nothing in this
        class calls it.

        Each return/yield from this method returns information about the next
        token in the form of a tuple with the following structure:
            (category name, attribute name, quoted strings, words w/o quotes
            or white space)

        :param file input_file:  file object ready for reading
        :rtype: Iterator[tuple]
        :raises pdbx.errors.PdbxSyntaxError:  if a semi-colon delimited
          multi-line string is not terminated before the end of input
        """
        # Regex definition for mmCIF syntax - semi-colon delimited strings are
        # handled outside of this regex.
        mmcif_re = re.compile(
            r"(?:"
            # _category.attribute
            r"(?:_(.+?)[.](\S+))"
            "|"
            # quoted strings
            r"(?:['\"](.*?)(?:['\"]\s|['\"]$))"
            "|"
            # comments (dumped)
            r"(?:\s*#.*$)"
            "|"
            # unquoted words
            r"(\S+)"
            r")"
        )
        file_iterator = iter(input_file)
        # Tokenizer loop begins here.  Plain for-loops are used because a
        # StopIteration escaping from next() inside a generator is converted
        # to RuntimeError (PEP 479).
        for line in file_iterator:
            self.__current_line_number += 1
            # Dump comments
            if line.startswith("#"):
                continue
            # Gobble up the entire semi-colon/multi-line delimited string and
            # stuff this into the string slot in the return tuple
            if line.startswith(";"):
                multiline_string = [line[1:]]
                for line in file_iterator:
                    self.__current_line_number += 1
                    if line.startswith(";"):
                        break
                    multiline_string.append(line)
                else:
                    self.__syntax_error("unterminated multi-line string")
                # remove trailing new-line that is part of the \n; delimiter
                multiline_string[-1] = multiline_string[-1].rstrip()
                yield (None, None, "".join(multiline_string), None)
                # Need to process the remainder of the current line
                line = line[1:]
            # Apply regex to the current line
            for match in mmcif_re.finditer(line):
                groups = match.groups()
                # A comment match captures nothing; skip it.
                if groups != (None, None, None, None):
                    yield groups