Main Page   Modules   Class Hierarchy   Alphabetical List   Compound List   File List   Compound Members   File Members   Related Pages  

Parser.h

Go to the documentation of this file.
00001 // -*- C++ -*-
00002 
00003 //=============================================================================
00004 /**
00005  *  @file    Parser.h
00006  *
00007  *  $Id: Parser.h,v 1.1.1.1 2003/02/21 18:36:32 chad Exp $
00008  *
00009  *  @author Nanbor Wang <nanbor@cs.wustl.edu>
00010  */
00011 //=============================================================================
00012 
00013 #ifndef _ACEXML_BASIC_PARSER_H_
00014 #define _ACEXML_BASIC_PARSER_H_
00015 
00016 #include "ace/pre.h"
00017 #include "ACEXML/parser/parser/Parser_export.h"
00018 
00019 #if !defined (ACE_LACKS_PRAGMA_ONCE)
00020 #pragma once
00021 #endif /* ACE_LACKS_PRAGMA_ONCE */
00022 
00023 #include "ACEXML/common/XMLReader.h"
00024 #include "ACEXML/common/LocatorImpl.h"
00025 #include "ACEXML/common/NamespaceSupport.h"
00026 #include "ACEXML/common/CharStream.h"
00027 #include "ace/Obstack.h"
00028 #include "ace/Functor.h"
00029 #include "ace/SString.h"
00030 #include "ace/Hash_Map_Manager.h"
00031 #include "ace/Containers_T.h"
00032 #include "ace/Auto_Ptr.h"
00033 #include "ACEXML/parser/parser/Entity_Manager.h"
00034 
00035 /**
00036  * @class ACEXML_Parser Parser.h "ACEXML/parser/parser/Parser.h"
00037  *
00038  * @brief A SAX based parser.
00039  *
00040  */
00041 class ACEXML_PARSER_Export ACEXML_Parser : public ACEXML_XMLReader
00042 {
00043 public:
00044   /// Default constructor.
00045   ACEXML_Parser (void);
00046 
00047   /// Destructor.
00048   virtual ~ACEXML_Parser (void);
00049 
00050   /*
00051    * Return the current content handler.
00052    */
00053   virtual ACEXML_ContentHandler *getContentHandler (void) const;
00054 
00055   /*
00056    * Return the current DTD handler.
00057    */
00058   virtual ACEXML_DTDHandler *getDTDHandler (void) const;
00059 
00060   /*
00061    * Return the current entity resolver.
00062    */
00063   virtual ACEXML_EntityResolver *getEntityResolver (void) const;
00064 
00065   /*
00066    * Return the current error handler.
00067    */
00068   virtual ACEXML_ErrorHandler *getErrorHandler (void) const;
00069 
00070   /**
00071    * Look up the value of a feature.  This method allows
00072    * programmers to check whether a specific feature has been
00073    * activated in the parser.
00074    */
00075   virtual int getFeature (const ACEXML_Char *name ACEXML_ENV_ARG_DECL)
00076           ACE_THROW_SPEC ((ACEXML_SAXNotRecognizedException,
00077                            ACEXML_SAXNotSupportedException)) ;
00078 
00079   /**
00080    * Activating or deactivating a feature.
00081    */
00082   virtual void setFeature (const ACEXML_Char *name,
00083                            int boolean_value ACEXML_ENV_ARG_DECL)
00084     ACE_THROW_SPEC ((ACEXML_SAXNotRecognizedException,
00085                      ACEXML_SAXNotSupportedException)) ;
00086 
00087   /*
00088    * Look up the value of a property.
00089    */
00090   virtual void * getProperty (const ACEXML_Char *name ACEXML_ENV_ARG_DECL)
00091     ACE_THROW_SPEC ((ACEXML_SAXNotRecognizedException,
00092                      ACEXML_SAXNotSupportedException)) ;
00093 
00094   /*
00095    * Set the value of a property.
00096    */
00097   virtual void setProperty (const ACEXML_Char *name,
00098                             void *value ACEXML_ENV_ARG_DECL)
00099     ACE_THROW_SPEC ((ACEXML_SAXNotRecognizedException,
00100                      ACEXML_SAXNotSupportedException)) ;
00101 
00102   /*
00103    * Parse an XML document.
00104    */
00105   virtual void parse (ACEXML_InputSource *input ACEXML_ENV_ARG_DECL)
00106     ACE_THROW_SPEC ((ACEXML_SAXException)) ;
00107 
00108   /*
00109    * Parse an XML document from a system identifier (URI).
00110    */
00111   virtual void parse (const ACEXML_Char *systemId ACEXML_ENV_ARG_DECL)
00112         ACE_THROW_SPEC ((ACEXML_SAXException))
00113     ;
00114 
00115   /*
00116    * Allow an application to register a content event handler.
00117    */
00118   virtual void setContentHandler (ACEXML_ContentHandler *handler);
00119 
00120   /*
00121    * Allow an application to register a DTD event handler.
00122    */
00123   virtual void setDTDHandler (ACEXML_DTDHandler *handler);
00124 
00125   /*
00126    * Allow an application to register an entity resolver.
00127    */
00128   virtual void setEntityResolver (ACEXML_EntityResolver *resolver);
00129 
00130   /*
00131    * Allow an application to register an error event handler.
00132    */
00133   virtual void setErrorHandler (ACEXML_ErrorHandler *handler);
00134 
00135   // *** Helper functions for parsing XML
00136 
00137   /**
00138    * Skip any whitespaces encountered until the first non-whitespace
00139    * character is encountered and consumed from the current input
00140    * CharStream.
00141    *
00142    * @param whitespace Return a pointer to the string of skipped
00143    * whitespace after proper conversion.  Null if there's no
00144    * whitespace found.
00145    *
00146    * @return The first non-whitespace character (which will be
00147    * consumed from the CharStream.)  If no whitespace is found, it
00148    * returns 0.
00149    *
00150    * @sa skip_whitespace_count
00151    */
00152   ACEXML_Char skip_whitespace (ACEXML_Char **whitespace);
00153 
00154   /**
00155    * Skip any whitespaces encountered until the first non-whitespace
00156    * character.  The first non-whitespace character is not consumed.
00157    * This method does peek into the input CharStream and therefore
00158    * is more expensive than @ref skip_whitespace.
00159    *
00160    * @param peek If non-null, @a peek points to an ACEXML_Char where
00161    *        skip_whitespace_count stores the first non-whitespace
00162    *        character it sees (character is not removed from the stream.)
00163    *
00164    * @return The number of whitespace characters consumed.
00165    *
00166    * @sa skip_whitespace
00167    */
00168   int skip_whitespace_count (ACEXML_Char *peek = 0);
00169 
00170   /**
00171    * Check if a character @a c is a whitespace.
00172    *
00173    * @retval 1 if @a c is a valid white space character. 0 otherwise.
00174    */
00175   int is_whitespace (ACEXML_Char c);
00176 
00177   /**
00178    * Check if a character @a c is a whitespace or '='.
00179    *
00180    * @retval 1 if true, 0 otherwise.
00181    */
00182   int is_whitespace_or_equal (ACEXML_Char c);
00183 
00184   /**
00185    * Check if a character @a c is a valid character for nonterminal NAME.
00186    * NOTE(review): name suggests the inverse (non-NAME char) -- confirm.
00187    * @retval 1 if true, 0 otherwise.
00188    */
00189   int is_nonname (ACEXML_Char c);
00190 
00191   /**
00192    * Skip an equal sign.
00193    *
00194    * @retval 0 when succeeds, -1 if no equal sign is found.
00195    */
00196   int skip_equal (void);
00197 
00198   /**
00199    * Get a quoted string.  Quoted strings are used to specify
00200    * attribute values and this routine will replace character and
00201    * entity references on-the-fly.  Parameter entities are not allowed
00202    * (or replaced) in this function.  (But regular entities are.)
00203    *
00204    * @param str returns the un-quoted string.
00205    *
00206    * @retval 0 on success, -1 otherwise.
00207    */
00208   int get_quoted_string (ACEXML_Char *&str);
00209 
00210   /**
00211    * Parse a PI statement.  The first character encountered
00212    * should always be '?' in the PI prefix "@<?".
00213    *
00214    * @retval 0 on success, -1 otherwise.
00215    */
00216   int parse_processing_instruction (ACEXML_ENV_SINGLE_ARG_DECL);
00217 
00218   /**
00219    * Skip over a comment. The first character encountered
00220    * should always be the first '-' in the comment prefix
00221    * "@<@!--".
00222    */
00223   int grok_comment ();
00224 
00225   /**
00226    * Read a name from the input CharStream (until white space).
00227    * If @a ch @!= 0, then we have already consumed the first name
00228    * character from the input CharStream, otherwise, read_name
00229    * will use this->get() to acquire the initial character.
00230    *
00231    * @return A pointer to the string in the obstack, 0 if it's not
00232    * a valid name.
00233    */
00234   ACEXML_Char *read_name (ACEXML_Char ch = 0);
00235 
00236   /**
00237    * Parse the DOCTYPE declaration.  The first character encountered
00238    * should always be  'D' in doctype prefix: "@<@!DOCTYPE".
00239    */
00240   int parse_doctypedecl (ACEXML_ENV_SINGLE_ARG_DECL)
00241         ACE_THROW_SPEC ((ACEXML_SAXException))
00242     ;
00243 
00244   /**
00245    * Parse an XML element.  The first character encountered should
00246    * be the first character of the element "Name".
00247    *
00248    * @param is_root If not 0, then we are expecting to see the "root"
00249    * element now, and the next element's name needs to match the name
00250    * defined in DOCTYPE definition, i.e., @a this->doctype_.
00251    *
00252    * @todo Instead of simply checking for the root element based on the
00253    * argument @a is_root, we should instead either pass in some sort
00254    * of validator or allow the function to return the element name so it
00255    * can be used in a validator.
00256    */
00257   void parse_element (int is_root ACEXML_ENV_ARG_DECL)
00258         ACE_THROW_SPEC ((ACEXML_SAXException))
00259     ;
00260 
00261   /**
00262    * Parse XML Prolog.
00263    */
00264   void parse_xml_prolog (ACEXML_ENV_SINGLE_ARG_DECL)
00265     ACE_THROW_SPEC ((ACEXML_SAXException));
00266 
00267 
00268   /**
00269    * Parse a character reference, i.e., "&#x20;" or "&#30;".   The first
00270    * character encountered should be the '#' char.
00271    *
00272    * @param buf points to a character buffer for the result.
00273    * @param len specifies the capacities of the buffer.
00274    *
00275    * @retval 0 on success and -1 otherwise.
00276    */
00277   int parse_char_reference (ACEXML_Char *buf, size_t len);
00278 
00279   /**
00280    * Parse an entity reference, i.e., "&amp;".  The first character
00281    * encountered should be the character following '&'.
00282    *
00283    * @return A pointer to the resolved const ACEXML_String if success
00284    * (previously defined), 0 otherwise.
00285    */
00286   const ACEXML_String *parse_reference (void);
00287 
00288   /**
00289    * Parse a CDATA section.  The first character should always be the first
00290    * '[' in CDATA definition.
00291    *
00292    * @retval 0 on success.
00293    * @retval -1 if fail.
00294    */
00295   int parse_cdata (ACEXML_ENV_SINGLE_ARG_DECL);
00296 
00297   /**
00298    * Parse a "markupdecl" section, this includes both "markupdecl" and
00299    * "DeclSep" sections in XML specification
00300    */
00301   int parse_internal_dtd (ACEXML_ENV_SINGLE_ARG_DECL);
00302 
00303   /**
00304    * Parse an "ELEMENT" decl.  The first character this method
00305    * expects is always the 'L' (the second char) in the word
00306    * "ELEMENT".
00307    *
00308    * @retval 0 on success, -1 otherwise.
00309    */
00310   int parse_element_decl (ACEXML_ENV_SINGLE_ARG_DECL);
00311 
00312   /**
00313    * Parse an "ENTITY" decl.  The first character this method expects
00314    * is always the 'N' (the second char) in the word "ENTITY".
00315    *
00316    * @retval 0 on success, -1 otherwise.
00317    */
00318   int parse_entity_decl (ACEXML_ENV_SINGLE_ARG_DECL);
00319 
00320   /**
00321    * Parse an "ATTLIST" decl.  The first character this method
00322    * expects is always the 'A' (the first char) in the word
00323    * "ATTLIST".
00324    *
00325    * @retval 0 on success, -1 otherwise.
00326    */
00327   int parse_attlist_decl (ACEXML_ENV_SINGLE_ARG_DECL);
00328 
00329   /**
00330    * Parse a "NOTATION" decl.  The first character this method
00331    * expects is always the 'N' (the first char) in the word
00332    * "NOTATION".
00333    *
00334    * @retval 0 on success, -1 otherwise.
00335    */
00336   int parse_notation_decl (ACEXML_ENV_SINGLE_ARG_DECL);
00337 
00338   /**
00339    * Parse an ExternalID or a reference to PUBLIC ExternalID.
00340    * Possible cases are in the forms of: <code>
00341    *
00342    * SYSTEM 'quoted string representing system resource'
00343    * PUBLIC 'quoted name of public ID' 'quoted resource'
00344    * PUBLIC 'quoted name we are referring to'
00345    * </code>
00346    *
00347    * The first character this function sees must be either 'S' or 'P'.
00348    * When the function finishes parsing, the input stream points
00349    * at the first non-whitespace character.
00350    *
00351    * @param publicId returns the unquoted publicId read.  If none
00352    *        is available, it will be reset to 0.
00353    * @param systemId returns the unquoted systemId read.  If none
00354    *        is available, it will be reset to 0.
00355    *
00356    * @retval 0 on success, -1 otherwise.
00357    */
00358   int parse_external_id_and_ref (ACEXML_Char *&publicId,
00359                                  ACEXML_Char *&systemId ACEXML_ENV_ARG_DECL);
00360 
00361   /**
00362    * Parse the "children" and "Mixed" non-terminals in contentspec.
00363    *
00364    * The first character this function sees must be the first
00365    * open paren '(' in children.
00366    *
00367    * @retval 0 on success, -1 otherwise.
00368    */
00369   int parse_children_definition (ACEXML_ENV_SINGLE_ARG_DECL);
00370 
00371   /**
00372    * Parse a @c cp non-terminal.  @c cp can either be a @c seq or a @c choice.
00373    * This function calls itself recursively.
00374    *
00375    * @param skip_open_paren when non-zero, it indicates that the open paren of
00376    *        the @c seq or @c choice has already been removed from the input
00377    *        stream.
00378    *
00379    * @retval 0 on success, -1 otherwise.
00380    */
00381   int parse_child (int skip_open_paren ACEXML_ENV_ARG_DECL);
00382 
00383 protected:
00384   /// Get a character.
00385   ACEXML_Char get (void);
00386 
00387   /// Peek a character.
00388   ACEXML_Char peek (void);
00389 
00390   /**
00391    * Check if more data can be added to a character buffer in obstack.
00392    * If not, the existing data in the buffer will be cleared out by
00393    * freezing the segment and passing it out through a content_handler_->characters ()
00394    * call.  @a len records the length of the existing data in
00395    * obstack.
00396    */
00397   int try_grow_cdata (size_t size, size_t &len ACEXML_ENV_ARG_DECL);
00398 
00399   // Feature names:
00400 
00401   /**
00402    * \addtogroup acexml_parser_features
00403    * @{
00404    */
00405 
00406   /**
00407    * @var simple_parsing_feature_
00408    *
00409    * This constant string defines the name of "simple XML parsing"
00410    * feature.  When this feature is enabled, ACEXML parser is allowed
00411    * to parse a simple XML stream without mandated XML prolog
00412    * and no DTD definition.
00413    */
00414   static const ACEXML_Char simple_parsing_feature_[];
00415 
00416   /**
00417    * @var namespaces_feature_
00418    *
00419    * This constant string defines the SAX XML Namespace feature. When this
00420    * feature is enabled, ACEXML parser allows access by namespace qualified
00421    * names.
00422    */
00423   static const ACEXML_Char namespaces_feature_[];
00424 
00425   /**
00426    *  @var namespace_prefixes_feature_
00427    *
00428    *  This constant string defines the SAX XML Namespace prefixes feature.
00429    *  Normally the list of attributes returned by the parser will not
00430    *  contain attributes used as namespace declarations (xmlns*). When this
00431    *  feature is enabled, the list of attributes contains the namespace
00432    *  declarations also.
00433    */
00434   static const ACEXML_Char namespace_prefixes_feature_[];
00435 
00436   /** @} */
00437 
00438 private:
00439   /**
00440    * Dispatch errors to ErrorHandler.
00441    *
00442    */
00443   void report_error (const ACEXML_Char* message ACEXML_ENV_ARG_DECL);
00444 
00445   /**
00446    * Dispatch warnings to ErrorHandler.
00447    *
00448    */
00449   void report_warning (const ACEXML_Char* message ACEXML_ENV_ARG_DECL);
00450 
00451   /**
00452    * Dispatch fatal errors to ErrorHandler.
00453    *
00454    */
00455   void report_fatal_error (const ACEXML_Char* message ACEXML_ENV_ARG_DECL);
00456 
00457   /**
00458    * Dispatch prefix mapping calls to the ContentHandler.
00459    *
00460    * @param prefix Namespace prefix
00461    * @param uri Namespace URI
00462    * @param name Local name
00463    * @param start 1 => startPrefixMapping 0 => endPrefixMapping
00464    */
00465   void report_prefix_mapping (const ACEXML_Char* prefix,
00466                               const ACEXML_Char* uri,
00467                               const ACEXML_Char* name,
00468                               int start ACEXML_ENV_ARG_DECL);
00469   /**
00470    *  Parse a keyword.
00471    */
00472   int parse_token (const ACEXML_Char* keyword);
00473 
00474   /// Keeping track of the handlers. We do not manage the memory for
00475   /// handlers.
00476   ACEXML_DTDHandler *dtd_handler_;
00477   ACEXML_EntityResolver *entity_resolver_;
00478   ACEXML_ContentHandler *content_handler_;
00479   ACEXML_ErrorHandler *error_handler_;
00480 
00481   /// @@ Feature and properties management structure here.
00482   /// Current input char stream.
00483   ACEXML_CharStream *instream_;
00484 
00485   /// My doctype, if any.
00486   ACEXML_Char *doctype_;
00487 
00488   /// External DTD System Literal, if any.
00489   ACEXML_Char *dtd_system_;
00490 
00491   /// External DTD Public Literal, if any.
00492   ACEXML_Char *dtd_public_;
00493 
00494   ACE_Obstack_T<ACEXML_Char> obstack_;
00495 
00496   ACEXML_NamespaceSupport xml_namespace_;
00497 
00498   ACEXML_Entity_Manager entities_;
00499 
00500   // Locator
00501   ACEXML_LocatorImpl locator_;
00502 
00503   // Feature flags
00504   int simple_parsing_;
00505   int namespaces_;
00506   int namespace_prefixes_;
00507 
00508 };
00509 
00510 #if defined (__ACEXML_INLINE__)
00511 # include "ACEXML/parser/parser/Parser.i"
00512 #endif /* __ACEXML_INLINE__ */
00513 
00514 #include "ace/post.h"
00515 
00516 #endif /* _ACEXML_BASIC_PARSER_H_ */

Generated on Mon Jun 16 13:23:24 2003 for ACEXML by doxygen1.2.14 written by Dimitri van Heesch, © 1997-2002