lmi
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[lmi] [PATCH] validate documents against schema when loading


From: Vaclav Slavik
Subject: [lmi] [PATCH] validate documents against schema when loading
Date: Mon, 18 Mar 2013 14:52:31 +0100
User-agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10.8; rv:17.0) Gecko/20130307 Thunderbird/17.0.4

Hi,

below is a patch that implements validation of single_cell_document and
multiple_cell_document data against their respective XML Schema when
loading from files.

It loads the schema -- as well as the sort_cell_subelements.xsl
stylesheet -- only once, when first used, so the first use is more
expensive than subsequent ones.

I didn't have real-life data files, so I tested performance only in
input_test.cpp, with sample.*, but it looked good: validating a document
is apparently much faster than loading a schema, so the initial cost of
loading the schema (which isn't that large to begin with) is amortized
quickly, even with very small data files that can be validated quickly.
If you want to experiment with it yourself, the two caching methods are
schema() and pre_validate_transform() in both classes.

The patch requires xmlwrapp-0.7; see
http://vslavik.github.com/xmlwrapp/ for the release tarball. (It contains
other useful things, such as a better error-reporting API, so I highly
recommend upgrading to it.)

Regards,
Vaclav


---
 multiple_cell_document.cpp | 48 ++++++++++++++++++++++++++++++++++++++++++---
 multiple_cell_document.hpp |  5 ++++-
 single_cell_document.cpp   | 49 +++++++++++++++++++++++++++++++++++++++++++---
 single_cell_document.hpp   |  5 ++++-
 xml_lmi_fwd.hpp            |  6 ++++++
 5 files changed, 105 insertions(+), 8 deletions(-)

diff --git a/multiple_cell_document.cpp b/multiple_cell_document.cpp
index 901b56f..9f32d2b 100644
--- a/multiple_cell_document.cpp
+++ b/multiple_cell_document.cpp
@@ -30,10 +30,13 @@
 
 #include "alert.hpp"
 #include "assert_lmi.hpp"
+#include "data_directory.hpp"
 #include "value_cast.hpp"
 #include "xml_lmi.hpp"
 
 #include <xmlwrapp/nodes_view.h>
+#include <xmlwrapp/schema.h>
+#include <xsltwrapp/stylesheet.h>
 
 #include <istream>
 #include <iterator>                     // std::distance()
@@ -53,7 +56,7 @@ multiple_cell_document::multiple_cell_document()
 multiple_cell_document::multiple_cell_document(std::string const& filename)
 {
     xml_lmi::dom_parser parser(filename);
-    parse(parser.root_node(xml_root_name()));
+    parse(parser);
     assert_vector_sizes_are_sane();
 }
 
@@ -92,6 +95,20 @@ std::string const& multiple_cell_document::xml_root_name() const
     return s;
 }
 
+xml::schema const& multiple_cell_document::schema() const
+{
+    static xml::schema const s
+        (xml_lmi::dom_parser(AddDataDir("multiple_cell_document.xsd")).document());
+    return s;
+}
+
+xslt::stylesheet& multiple_cell_document::pre_validate_transform() const
+{
+    static xslt::stylesheet s
+        (xml_lmi::dom_parser(AddDataDir("sort_cell_subelements.xsl")).document());
+    return s;
+}
+
 namespace
 {
 /// Throw an exception while pretending to return an lvalue.
@@ -113,10 +130,33 @@ T& hurl(std::string const& s)
 }
 } // Unnamed namespace.
 
+/// Validate correctness of the file.
+
+void multiple_cell_document::validate(xml::document const& doc)
+{
+    try
+        {
+        // Sorting is a necessary first step because the system that
+        // provides the file will not change its format:
+        xml::document const& transformed = pre_validate_transform().apply(doc);
+        schema().validate(transformed);
+        }
+    catch(std::exception const& e)
+        {
+        fatal_error()
+            << "The document is not valid:\n"
+            << e.what()
+            << LMI_FLUSH
+            ;
+        }
+}
+
 /// Read xml into vectors of class Input.
 
-void multiple_cell_document::parse(xml::element const& root)
+void multiple_cell_document::parse(xml_lmi::dom_parser& parser)
 {
+    xml::element const& root(parser.root_node(xml_root_name()));
+
     int file_version = 0;
     if(!xml_lmi::get_attr(root, "version", file_version))
         {
@@ -131,6 +171,8 @@ void multiple_cell_document::parse(xml::element const& root)
         fatal_error() << "Incompatible file version." << LMI_FLUSH;
         }
 
+    validate(parser.document());
+
     case_parms_ .clear();
     class_parms_.clear();
     cell_parms_ .clear();
@@ -352,7 +394,7 @@ void multiple_cell_document::parse_v0(xml::element const& root)
 void multiple_cell_document::read(std::istream const& is)
 {
     xml_lmi::dom_parser parser(is);
-    parse(parser.root_node(xml_root_name()));
+    parse(parser);
 }
 
 //============================================================================
diff --git a/multiple_cell_document.hpp b/multiple_cell_document.hpp
index 0eae9cd..b9483db 100644
--- a/multiple_cell_document.hpp
+++ b/multiple_cell_document.hpp
@@ -58,13 +58,16 @@ class LMI_SO multiple_cell_document
     void write(std::ostream&) const;
 
   private:
-    void parse   (xml::element const&);
+    void validate(xml::document const&);
+    void parse   (xml_lmi::dom_parser& parser);
     void parse_v0(xml::element const&);
 
     void assert_vector_sizes_are_sane() const;
 
     int                class_version() const;
     std::string const& xml_root_name() const;
+    xml::schema const& schema() const;
+    xslt::stylesheet&  pre_validate_transform() const;
 
     // Default parameters for the whole case, stored as a vector for
     // parallelism with class_parms_ and cell_parms_. Naturally, this
diff --git a/single_cell_document.cpp b/single_cell_document.cpp
index 2874a68..1f55cb7 100644
--- a/single_cell_document.cpp
+++ b/single_cell_document.cpp
@@ -29,9 +29,12 @@
 #include "single_cell_document.hpp"
 
 #include "assert_lmi.hpp"
+#include "data_directory.hpp"
 #include "xml_lmi.hpp"
 
 #include <xmlwrapp/nodes_view.h>
+#include <xmlwrapp/schema.h>
+#include <xsltwrapp/stylesheet.h>
 
 #include <istream>
 #include <ostream>
@@ -54,7 +57,7 @@ single_cell_document::single_cell_document(std::string const& filename)
     :input_data_()
 {
     xml_lmi::dom_parser parser(filename);
-    parse(parser.root_node(xml_root_name()));
+    parse(parser);
 }
 
 //============================================================================
@@ -69,9 +72,49 @@ std::string const& single_cell_document::xml_root_name() const
     return s;
 }
 
+xml::schema const& single_cell_document::schema() const
+{
+    static xml::schema const s
+        (xml_lmi::dom_parser(AddDataDir("single_cell_document.xsd")).document());
+    return s;
+}
+
+xslt::stylesheet& single_cell_document::pre_validate_transform() const
+{
+    static xslt::stylesheet s
+        (xml_lmi::dom_parser(AddDataDir("sort_cell_subelements.xsl")).document());
+    return s;
+}
+
 //============================================================================
-void single_cell_document::parse(xml::element const& root)
+/// Validate correctness of the file.
+
+void single_cell_document::validate(xml::document const& doc)
 {
+    try
+        {
+        // Sorting is a necessary first step because the system that
+        // provides the file will not change its format:
+        xml::document const& transformed = pre_validate_transform().apply(doc);
+        schema().validate(transformed);
+        }
+    catch(std::exception const& e)
+        {
+        fatal_error()
+            << "The document is not valid:\n"
+            << e.what()
+            << LMI_FLUSH
+            ;
+        }
+}
+
+//============================================================================
+void single_cell_document::parse(xml_lmi::dom_parser& parser)
+{
+    validate(parser.document());
+
+    xml::element const& root(parser.root_node(xml_root_name()));
+
     xml::const_nodes_view const elements(root.elements());
     LMI_ASSERT(!elements.empty());
     xml::const_nodes_view::const_iterator i(elements.begin());
@@ -86,7 +129,7 @@ void single_cell_document::parse(xml::element const& root)
 void single_cell_document::read(std::istream const& is)
 {
     xml_lmi::dom_parser parser(is);
-    parse(parser.root_node(xml_root_name()));
+    parse(parser);
 }
 
 //============================================================================
diff --git a/single_cell_document.hpp b/single_cell_document.hpp
index 67e723f..97a8f28 100644
--- a/single_cell_document.hpp
+++ b/single_cell_document.hpp
@@ -54,8 +54,11 @@ class LMI_SO single_cell_document
     void write(std::ostream&) const;
 
   private:
-    void parse(xml::element const&);
+    void validate(xml::document const&);
+    void parse(xml_lmi::dom_parser&);
     std::string const& xml_root_name() const;
+    xml::schema const& schema() const;
+    xslt::stylesheet&  pre_validate_transform() const;
 
     Input input_data_;
 };
diff --git a/xml_lmi_fwd.hpp b/xml_lmi_fwd.hpp
index 0ded1e7..bb9e07f 100644
--- a/xml_lmi_fwd.hpp
+++ b/xml_lmi_fwd.hpp
@@ -32,6 +32,7 @@ namespace xml // This is xmlwrapp's namespace.
     class document;
     class init;
     class node;
+    class schema;
     class tree_parser;
 
     /// XMLWRAPP !! It is useful to distinguish elements from DOM
@@ -40,6 +41,11 @@ namespace xml // This is xmlwrapp's namespace.
     typedef xml::node element;
 } // namespace xml
 
+namespace xslt // This is xsltwrapp's namespace.
+{
+    class stylesheet;
+} // namespace xslt
+
 /// Interface to xmlwrapp.
 
 namespace xml_lmi
-- 
1.8.2





reply via email to

[Prev in Thread] Current Thread [Next in Thread]