Source code for pdbx.reader

##
# File: PdbxReader.py
# Date: 2012-01-09 Jdw Adapted from PdbxParser
#
# Updates:
#
# 2012-01-09 - (jdw) Separate reader and writer classes.
#
# 2012-09-02 - (jdw) Revise tokenizer to better handle embedded quoting.
#
##
"""PDBx/mmCIF dictionary and data file parser.

.. note::

   Acknowledgements:

   The tokenizer used in this module is modeled after the clever parser
   design used in the PyMMLib package.

   PyMMLib Development Group:

   Authors: Ethan Merritt: merritt@u.washington.edu, Jay Painter: jay.painter@gmail.com

   See: http://pymmlib.sourceforge.net/
"""
import re
from .containers import DataCategory, DefinitionContainer, DataContainer
from .errors import PdbxSyntaxError


class PdbxReader:
    """PDBx reader for data files and dictionaries.

    The reader tokenizes an open text file and appends the data and
    definition containers it finds to a caller-supplied list.
    """

    # Regex for the single-line mmCIF tokens.  Semi-colon delimited
    # multi-line strings are handled outside of this regex.  The capture
    # groups are, in order: category name, attribute name, single-quoted
    # string, double-quoted string, unquoted word.
    _MMCIF_RE = re.compile(
        r"(?:"
        r"(?:_(.+?)[.](\S+))"
        "|"  # _category.attribute
        r"(?:['](.*?)(?:[']\s|[']$))"
        "|"  # single quoted strings
        r"(?:[\"](.*?)(?:[\"]\s|[\"]$))"
        "|"  # double quoted strings
        r"(?:\s*#.*$)"
        "|"  # comments (dumped)
        r"(\S+)"  # unquoted words
        r")"
    )

    def __init__(self, input_file):
        """Initialize.

        :param file input_file:  input file handle; e.g. as returned by
            open().
        """
        self.__current_line_number = 0
        self.__input_file = input_file
        # Maps the text before the first "_" of a word (lower-cased) to the
        # parser state that handles that reserved word.
        self.__state_dict = {
            "data": "ST_DATA_CONTAINER",
            "loop": "ST_TABLE",
            "global": "ST_GLOBAL_CONTAINER",
            "save": "ST_DEFINITION",
            "stop": "ST_STOP",
        }

    def read(self, container_list):
        """Appends to the input list of definition and data containers.

        :param list container_list:  list of
            :class:`~pdbx.containers.ContainerBase` containers to append to.
        :raises pdbx.errors.PdbxSyntaxError:  on a syntax error, including
            input that ends where the parser still requires a token.
        """
        self.__current_line_number = 0
        try:
            self.__parser(self.__tokenizer(self.__input_file), container_list)
        except StopIteration:
            # A bare next() in the parser ran out of tokens.
            self.__syntax_error("Unexpected end of file")

    def __syntax_error(self, error_text):
        """Raise a PdbxSyntaxError.

        This method never returns normally.

        :param str error_text:  text for exception message
        :raises pdbx.errors.PdbxSyntaxError:  exception with the current
            line number and the error text
        """
        raise PdbxSyntaxError(self.__current_line_number, error_text)

    @staticmethod
    def __get_container_name(in_word) -> str:
        """Returns the name of the data_ or save_ container.

        :param str in_word:  input word; the first five characters (the
            ``data_`` or ``save_`` prefix) are dropped.
        :returns:  the remainder of the word with surrounding whitespace
            stripped
        """
        return str(in_word[5:]).strip()

    def __get_state(self, in_word) -> tuple:
        """Identifies reserved syntax elements and assigns an associated
        state.

        :param str in_word:  input word
        :returns:  (reserved word, state) where:

            * reserved word - is one of the CIF syntax elements ``data``,
              ``loop``, ``global``, ``save``, ``stop`` (lower-cased, without
              the trailing underscore), or None if the word is not reserved
            * state - the parser state required to process this next
              section, or ``"ST_UNKNOWN"`` if the word is not reserved.
        """
        prefix, separator, _ = in_word.partition("_")
        if not separator:
            return None, "ST_UNKNOWN"
        reserved_word = prefix.lower()
        state = self.__state_dict.get(reserved_word)
        if state is None:
            return None, "ST_UNKNOWN"
        return reserved_word, state

    def __parser(self, tokenizer, container_list):
        """Parser for PDBx data files and dictionaries.

        :param tokenizer:  reentrant method recognizing data item names
            (_category.attribute), quoted strings (single, double and
            multi-line semi-colon delimited), and unquoted strings.
        :param list container_list:  list-type container for data and
            definition objects parsed from the input file.
            container_list is appended with data and definition objects.
        :raises pdbx.errors.PdbxSyntaxError:  on malformed input
        :raises StopIteration:  if the tokens run out where a value or a
            loop_ item name is required
        """
        # Working container - data or definition
        current_container = None
        # Working category container
        category_index = {}
        current_category = None
        current_row = None
        state = None

        # Find the first reserved word and begin capturing data.  Tokens
        # that precede it are skipped.
        for (
            current_category_name,
            current_attribute_name,
            current_quoted_string,
            current_word,
        ) in tokenizer:
            if current_word is None:
                continue
            reserved_word, state = self.__get_state(current_word)
            if reserved_word is not None:
                break
        else:
            # empty file (no reserved word found)
            return

        while True:
            # Set the current state: at this point in the processing cycle we
            # are expecting a token containing either a '_category.attribute'
            # or a reserved word.
            if current_category_name is not None:
                state = "ST_KEY_VALUE_PAIR"
            elif current_word is not None:
                reserved_word, state = self.__get_state(current_word)
            else:
                self.__syntax_error("Miscellaneous syntax error")

            # Process _category.attribute value assignments
            if state == "ST_KEY_VALUE_PAIR":
                try:
                    current_category = category_index[current_category_name]
                except KeyError:
                    # A new category is encountered - create a container and
                    # add a row
                    category_index[current_category_name] = DataCategory(
                        current_category_name
                    )
                    current_category = category_index[current_category_name]
                    try:
                        current_container.append(current_category)
                    except AttributeError:
                        # current_container is still None: no data_/save_
                        # block has been opened.
                        self.__syntax_error(
                            "Category cannot be added to data_ block"
                        )
                    current_row = []
                    current_category.append(current_row)
                else:
                    # Recover the existing row from the category
                    try:
                        current_row = current_category[0]
                    except IndexError:
                        self.__syntax_error(
                            "Internal index error accessing category data"
                        )

                # Check for duplicate attributes and add attribute to table.
                if current_attribute_name in current_category.attribute_list:
                    self.__syntax_error(
                        "Duplicate attribute encountered in category"
                    )
                current_category.append_attribute(current_attribute_name)

                # Get the data for this attribute from the next token
                tok_category, _, current_quoted_string, current_word = next(
                    tokenizer
                )
                if tok_category is not None or (
                    current_quoted_string is None and current_word is None
                ):
                    self.__syntax_error(
                        f"Missing data for item _{current_category_name}."
                        f"{current_attribute_name}"
                    )

                if current_word == "?":
                    current_row.append(None)
                elif current_word == ".":
                    current_row.append("")
                elif current_word is not None:
                    # Validation check token for misplaced reserved words
                    reserved_word, state = self.__get_state(current_word)
                    if reserved_word is not None:
                        self.__syntax_error(
                            f"Unexpected reserved word: {reserved_word}"
                        )
                    current_row.append(current_word)
                elif current_quoted_string is not None:
                    current_row.append(current_quoted_string)
                else:
                    self.__syntax_error("Missing value in item-value pair")

                try:
                    (
                        current_category_name,
                        current_attribute_name,
                        current_quoted_string,
                        current_word,
                    ) = next(tokenizer)
                except StopIteration:
                    # Clean end of input after a complete item-value pair.
                    return
                continue

            # Process a loop_ declaration and associated data
            if state == "ST_TABLE":
                # The category name in the next current_category_name,
                # current_attribute_name pair defines the name of the
                # category container.
                (
                    current_category_name,
                    current_attribute_name,
                    current_quoted_string,
                    current_word,
                ) = next(tokenizer)
                if (current_category_name is None) or (
                    current_attribute_name is None
                ):
                    self.__syntax_error(
                        "Unexpected token in loop_ declaration"
                    )

                # Check for a previous category declaration.
                if current_category_name in category_index:
                    self.__syntax_error(
                        "Duplicate category declaration in loop_"
                    )

                current_category = DataCategory(current_category_name)
                try:
                    current_container.append(current_category)
                except AttributeError:
                    self.__syntax_error(
                        "loop_ declaration outside of data_ block or save_ "
                        "frame"
                    )
                current_category.append_attribute(current_attribute_name)

                # Read the rest of the loop_ declaration
                for (
                    current_category_name,
                    current_attribute_name,
                    current_quoted_string,
                    current_word,
                ) in tokenizer:
                    if current_category_name is None:
                        # First non-item token: the data values start here.
                        break
                    if current_category_name != current_category.name:
                        self.__syntax_error(
                            "Changed category name in loop_ declaration"
                        )
                    current_category.append_attribute(current_attribute_name)
                else:
                    # formal CIF 1.1 grammar expects at least one value
                    self.__syntax_error("loop_ without values")

                # If the next token is a 'word', check it for any reserved
                # words
                if current_word is not None:
                    reserved_word, state = self.__get_state(current_word)
                    if reserved_word is not None:
                        if reserved_word == "stop":
                            return
                        self.__syntax_error(
                            "Unexpected reserved word after loop "
                            f"declaration: {reserved_word}"
                        )

                # Read the table of data for this loop_
                while True:
                    current_row = []
                    current_category.append(current_row)
                    # Consume one token per declared attribute; the end-of-
                    # table checks below only run between complete rows.
                    for _ in current_category.attribute_list:
                        if current_word == "?":
                            current_row.append(None)
                        elif current_word == ".":
                            current_row.append("")
                        elif current_word is not None:
                            current_row.append(current_word)
                        elif current_quoted_string is not None:
                            current_row.append(current_quoted_string)
                        try:
                            (
                                current_category_name,
                                current_attribute_name,
                                current_quoted_string,
                                current_word,
                            ) = next(tokenizer)
                        except StopIteration:
                            return
                    # loop_ data processing ends if a new _category.attribute
                    # is encountered
                    if current_category_name is not None:
                        break
                    # A reserved word is encountered
                    if current_word is not None:
                        reserved_word, state = self.__get_state(current_word)
                        if reserved_word is not None:
                            break
                continue

            if state == "ST_DEFINITION":
                # Ignore trailing unnamed saveframe delimiters e.g. 'save_'
                state_name = self.__get_container_name(current_word)
                if state_name:
                    current_container = DefinitionContainer(state_name)
                    container_list.append(current_container)
                    category_index = {}
                    current_category = None
            elif state == "ST_DATA_CONTAINER":
                data_name = self.__get_container_name(current_word)
                if not data_name:
                    data_name = "unidentified"
                current_container = DataContainer(data_name)
                container_list.append(current_container)
                category_index = {}
                current_category = None
            elif state == "ST_STOP":
                return
            elif state == "ST_GLOBAL_CONTAINER":
                # Must match the state name registered for "global" in
                # self.__state_dict.
                current_container = DataContainer("blank-global")
                current_container.set_global()
                container_list.append(current_container)
                category_index = {}
                current_category = None
            elif state == "ST_UNKNOWN":
                self.__syntax_error(
                    "Unrecognized syntax element: " + str(current_word)
                )
            else:
                # Explicit raise so the check survives ``python -O``.
                raise AssertionError(f"unhandled state {state}")

            try:
                (
                    current_category_name,
                    current_attribute_name,
                    current_quoted_string,
                    current_word,
                ) = next(tokenizer)
            except StopIteration:
                return

    def __tokenizer(self, input_file):
        """Tokenizer method for the mmCIF syntax file.

        Each return/yield from this method returns information about the next
        token in the form of a tuple with the following structure:
        (category name, attribute name, quoted strings, words w/o quotes or
        white space)

        Single- and double-quoted strings are matched by separate regex
        alternatives to better handle embedded quotes; both are reported in
        the same "quoted strings" slot.

        :param file input_file:  file object ready for reading
        :rtype:  Iterator[tuple]
        :raises pdbx.errors.PdbxSyntaxError:  if a semi-colon delimited
            string is not terminated before the end of the input
        """
        file_iterator = iter(input_file)

        # Tokenizer loop begins here
        for line in file_iterator:
            self.__current_line_number += 1

            # Dump comments
            if line.startswith("#"):
                continue

            # Gobble up the entire semi-colon/multi-line-delimited string and
            # stuff this into the string slot in the return tuple
            if line.startswith(";"):
                multiline_string = [line[1:]]
                for closing_line in file_iterator:
                    self.__current_line_number += 1
                    if closing_line.startswith(";"):
                        break
                    multiline_string.append(closing_line)
                else:
                    self.__syntax_error("unterminated multi-line string")
                # remove trailing new-line that is part of the \n; delimiter
                multiline_string[-1] = multiline_string[-1].rstrip()
                yield (None, None, "".join(multiline_string), None)
                # Need to process the remainder of the terminating line
                line = closing_line[1:]

            # Apply regex to the current line and consolidate the
            # single/double quoted matches into one quoted-string slot
            for match in self._MMCIF_RE.finditer(line):
                category, attribute, single, double, word = match.groups()
                if (category, attribute, single, double, word) == (
                    None,
                    None,
                    None,
                    None,
                    None,
                ):
                    # Only the comment alternative matched; nothing to emit.
                    continue
                quoted_string = single if single is not None else double
                yield (category, attribute, quoted_string, word)

    def __tokenizer_org(self, input_file):
        """Earlier tokenizer for the mmCIF syntax file.

        Not called by :meth:`read`.  It differs from the tokenizer in use
        by matching single- and double-quoted strings with one shared regex
        alternative.

        Each return/yield from this method returns information about the next
        token in the form of a tuple with the following structure:
        (category name, attribute name, quoted strings, words w/o quotes or
        white space)

        :param file input_file:  file object ready for reading
        :rtype:  Iterator[tuple]
        :raises pdbx.errors.PdbxSyntaxError:  if a semi-colon delimited
            string is not terminated before the end of the input
        """
        # Regex definition for mmCIF syntax - semi-colon delimited strings are
        # handled outside of this regex.
        mmcif_re = re.compile(
            r"(?:"
            r"(?:_(.+?)[.](\S+))"
            "|"  # _category.attribute
            r"(?:['\"](.*?)(?:['\"]\s|['\"]$))"
            "|"  # quoted strings
            r"(?:\s*#.*$)"
            "|"  # comments (dumped)
            r"(\S+)"  # unquoted words
            r")"
        )
        file_iterator = iter(input_file)

        # Tokenizer loop begins here.  ``for`` loops are used instead of
        # bare next() calls: a StopIteration escaping a generator body is
        # converted to RuntimeError (PEP 479).
        for line in file_iterator:
            self.__current_line_number += 1

            # Dump comments
            if line.startswith("#"):
                continue

            # Gobble up the entire semi-colon/multi-line delimited string and
            # stuff this into the string slot in the return tuple
            if line.startswith(";"):
                multiline_string = [line[1:]]
                for closing_line in file_iterator:
                    self.__current_line_number += 1
                    if closing_line.startswith(";"):
                        break
                    multiline_string.append(closing_line)
                else:
                    self.__syntax_error("unterminated multi-line string")
                # remove trailing new-line that is part of the \n; delimiter
                multiline_string[-1] = multiline_string[-1].rstrip()
                yield (None, None, "".join(multiline_string), None)
                # Need to process the remainder of the terminating line
                line = closing_line[1:]

            # Apply regex to the current line
            for match in mmcif_re.finditer(line):
                groups = match.groups()
                if groups != (None, None, None, None):
                    yield groups