diff --git a/packtools/stylechecker.py b/packtools/stylechecker.py new file mode 100644 index 000000000..c98c93253 --- /dev/null +++ b/packtools/stylechecker.py @@ -0,0 +1,119 @@ +import re +import os +import logging + +from lxml import etree + +from packtools.utils import setdefault + + +HERE = os.path.dirname(os.path.abspath(__file__)) +SCHEMAS = { + 'SciELO-journalpublishing1.xsd': os.path.join(HERE, 'sps_xsd', 'sps.xsd'), +} +EXPOSE_ELEMENTNAME_PATTERN = re.compile(r"(?<=Element )'.*?'") + +logger = logging.getLogger(__name__) + + +def XMLSchema(schema_name): + with open(SCHEMAS[schema_name]) as fp: + xmlschema_doc = etree.parse(fp) + + xmlschema = etree.XMLSchema(xmlschema_doc) + return xmlschema + + +class XML(object): + def __init__(self, file): + """ + :param file: Path to the XML file or etree. + """ + if isinstance(file, etree._ElementTree): + self.lxml = file + else: + self.lxml = etree.parse(file) + + self.xmlschema = XMLSchema('SciELO-journalpublishing1.xsd') + + def find(self, tagname, lineno): + for elem in self.lxml.findall('//' + tagname): + if elem.sourceline == lineno: + logger.debug('method *find*: hit a regular element: %s.' % tagname) + return elem + else: + root = self.lxml.getroot() + if root.tag == tagname: + logger.debug('method *find*: hit a root element.') + return root + + + def validate(self): + result = setdefault(self, '__validation_result', lambda: self.xmlschema.validate(self.lxml)) + errors = setdefault(self, '__validation_errors', lambda: self.xmlschema.error_log) + return result, errors + + def annotate_errors(self): + result, errors = self.validate() + + for error in errors: + match = EXPOSE_ELEMENTNAME_PATTERN.search(error.message) + if match is None: + raise ValueError('Could not locate the element name in %s.' % error.message) + else: + element_name = match.group(0).strip("'") + + err_element = self.find(element_name, error.line) + if err_element is None: + raise ValueError('Could not locate the erratic element %s at line %s to annotate: %s.' % (element_name, error.line, error.message)) + + notice_element = etree.Element('SPS-ERROR') + notice_element.text = error.message + try: + err_element.addprevious(notice_element) + except TypeError: + # In case of a root element, a comment if added. + err_element.addprevious(etree.Comment('SPS-ERROR: %s' % error.message)) + + def __str__(self): + return etree.tostring(self.lxml, pretty_print=True, + encoding='utf-8', xml_declaration=True) + + def __unicode__(self): + return str(self).decode('utf-8') + + def __repr__(self): + return '' % (self.lxml, self.validate()[0]) + + def read(self): + """ + Read the XML contents as text. + """ + return unicode(self) + + +if __name__ == '__main__': + import argparse + import sys + + parser = argparse.ArgumentParser(description='stylechecker cli utility.') + parser.add_argument('--annotated', action='store_true') + parser.add_argument('xmlpath', help='Absolute or relative path to the XML file.') + + args = parser.parse_args() + xml = XML(args.xmlpath) + + is_valid, errors = xml.validate() + + if args.annotated: + xml.annotate_errors() + sys.stdout.write(str(xml)) + + else: + if not is_valid: + print 'Invalid XML! Found %s errors:' % len(errors) + for err in errors: + print '%s,%s\t%s' % (err.line, err.column, err.message) + else: + print 'Valid XML! ;)' + diff --git a/packtools/utils.py b/packtools/utils.py index c286caebe..402a4a9d4 100644 --- a/packtools/utils.py +++ b/packtools/utils.py @@ -52,3 +52,13 @@ def checksum_file(filepath, callable): return hash.hexdigest() + +def setdefault(object, attribute, producer): + """ + Like dict().setdefault but for object attributes. + """ + if not hasattr(object, attribute): + setattr(object, attribute, producer()) + + return getattr(object, attribute) + diff --git a/packtools/xray.py b/packtools/xray.py index 5d75c36fc..aa0ef64a8 100644 --- a/packtools/xray.py +++ b/packtools/xray.py @@ -8,20 +8,13 @@ from lxml import etree from . import utils +from . import stylechecker logger = logging.getLogger(__name__) -def get_xmlschema(path): - xmlschema_doc = etree.parse(open(path, 'r')) - xmlschema = etree.XMLSchema(xmlschema_doc) - return xmlschema - - class SPSMixin(object): - xmlschema = get_xmlschema(os.path.join( - os.path.dirname(os.path.abspath(__file__)), 'sps_xsd', 'sps.xsd')) @property def xmls(self): @@ -56,6 +49,13 @@ def meta(self): return dct_mta + def is_valid_schema(self): + """ + Checks if the XML is valid against SPS XSD. + More info at: https://github.com/scieloorg/scielo_publishing_schema + """ + return self.stylechecker.validate()[0] + def is_valid_meta(self): """ Checks if the minimum required data to identify a package is present. @@ -65,13 +65,6 @@ def is_valid_meta(self): meta['journal_eissn'] or meta['journal_pissn']) and ( meta['issue_volume'] or meta['issue_number'])) - def is_valid_schema(self): - """ - Checks if the XML is valid against SPS XSD. - More info at: https://github.com/scieloorg/scielo_publishing_schema - """ - return self.xmlschema.validate(self.xml) - def is_valid_package(self): """ Validate if exist at least one XML file and one PDF file @@ -91,6 +84,10 @@ def is_valid(self): """ return self.is_valid_package() and self.is_valid_schema() and self.is_valid_meta() + @property + def stylechecker(self): + return utils.setdefault(self, '__stylechecker', lambda: stylechecker.XML(self.xml)) + class Xray(object): diff --git a/tests/test_stylechecker.py b/tests/test_stylechecker.py new file mode 100644 index 000000000..049e49850 --- /dev/null +++ b/tests/test_stylechecker.py @@ -0,0 +1,144 @@ +#coding: utf-8 +import unittest +from StringIO import StringIO +from tempfile import NamedTemporaryFile + +from lxml import etree + +from packtools import stylechecker + + +# valid: +# invalid: anything else +sample_xsd = StringIO('''\ + + + + + + + + +''') + + +def setup_tmpfile(method): + def wrapper(self): + valid_tmpfile = NamedTemporaryFile() + valid_tmpfile.write(b'bar') + valid_tmpfile.seek(0) + self.valid_tmpfile = valid_tmpfile + + method(self) + + self.valid_tmpfile.close() + return wrapper + + +class XMLTests(unittest.TestCase): + + @setup_tmpfile + def test_initializes_with_filepath(self): + self.assertTrue(stylechecker.XML(self.valid_tmpfile.name)) + + def test_initializes_with_etree(self): + fp = StringIO(b'bar') + et = etree.parse(fp) + + self.assertTrue(stylechecker.XML(et)) + + def test_validation(self): + fp = etree.parse(StringIO(b'bar')) + xml = stylechecker.XML(fp) + xml.xmlschema = etree.XMLSchema(etree.parse(sample_xsd)) + + result, errors = xml.validate() + self.assertTrue(result) + self.assertFalse(errors) + + def test_invalid(self): + fp = etree.parse(StringIO(b'bar')) + xml = stylechecker.XML(fp) + xml.xmlschema = etree.XMLSchema(etree.parse(sample_xsd)) + + result, _ = xml.validate() + self.assertFalse(result) + + def test_invalid_errors(self): + # Default lxml error log. + fp = etree.parse(StringIO(b'bar')) + xml = stylechecker.XML(fp) + xml.xmlschema = etree.XMLSchema(etree.parse(sample_xsd)) + + _, errors = xml.validate() + self.assertIsInstance(errors, etree._ListErrorLog) + + def test_find(self): + fp = etree.parse(StringIO(b'\nbar\n')) + xml = stylechecker.XML(fp) + xml.xmlschema = etree.XMLSchema(etree.parse(sample_xsd)) + + elem = xml.find('b', 2) + self.assertEqual(elem.tag, 'b') + self.assertEqual(elem.sourceline, 2) + + def test_find_root_element(self): + fp = etree.parse(StringIO(b'\nbar\n')) + xml = stylechecker.XML(fp) + xml.xmlschema = etree.XMLSchema(etree.parse(sample_xsd)) + + elem = xml.find('a', 1) + self.assertEqual(elem.tag, 'a') + self.assertEqual(elem.sourceline, 1) + + def test_find_missing(self): + fp = etree.parse(StringIO(b'\nbar\n')) + xml = stylechecker.XML(fp) + xml.xmlschema = etree.XMLSchema(etree.parse(sample_xsd)) + + self.assertIsNone(xml.find('c', 2)) + + def test_annotate_errors(self): + fp = etree.parse(StringIO(b'bar')) + xml = stylechecker.XML(fp) + xml.xmlschema = etree.XMLSchema(etree.parse(sample_xsd)) + + xml.annotate_errors() + self.assertIn("Element 'c': This element is not expected. Expected is ( b ).", str(xml)) + + def test_annotate_errors(self): + fp = etree.parse(StringIO(b'bar')) + xml = stylechecker.XML(fp) + xml.xmlschema = etree.XMLSchema(etree.parse(sample_xsd)) + + xml.annotate_errors() + xml_text = xml.read() + + self.assertIn("Element 'c': This element is not expected. Expected is ( b ).", xml_text) + self.assertTrue(isinstance(xml_text, unicode)) + + +class ElementNamePatternTests(unittest.TestCase): + pattern = stylechecker.EXPOSE_ELEMENTNAME_PATTERN + + def test_case1(self): + message = "Element 'article', attribute 'dtd-version': [facet 'enumeration'] The value '3.0' is not an element of the set {'1.0'}." + self.assertEqual(self.pattern.search(message).group(0), "'article'") + + + def test_case2(self): + message = "Element 'article', attribute 'dtd-version': '3.0' is not a valid value of the local atomic type." + self.assertEqual(self.pattern.search(message).group(0), "'article'") + + def test_case3(self): + message = "Element 'author-notes': This element is not expected. Expected is one of ( label, title, ack, app-group, bio, fn-group, glossary, ref-list, notes, sec )." + self.assertEqual(self.pattern.search(message).group(0), "'author-notes'") + + def test_case4(self): + message = "Element 'journal-title-group': This element is not expected. Expected is ( journal-id )." + self.assertEqual(self.pattern.search(message).group(0), "'journal-title-group'") + + def test_case5(self): + message = "Element 'contrib-group': This element is not expected. Expected is one of ( article-id, article-categories, title-group )." + self.assertEqual(self.pattern.search(message).group(0), "'contrib-group'") + diff --git a/tests/test_xray.py b/tests/test_xray.py index 8fead915e..ac73ee6cf 100644 --- a/tests/test_xray.py +++ b/tests/test_xray.py @@ -1,23 +1,25 @@ #coding: utf-8 +import unittest import zipfile from tempfile import NamedTemporaryFile -from lxml import etree +from lxml import etree import mocker -import unittest from packtools import xray as x_ray -class SPSMixinTests(mocker.MockerTestCase): +def make_test_archive(arch_data): + fp = NamedTemporaryFile() + with zipfile.ZipFile(fp, 'w') as zipfp: + for archive, data in arch_data: + zipfp.writestr(archive, data) - def _make_test_archive(self, arch_data): - fp = NamedTemporaryFile() - with zipfile.ZipFile(fp, 'w') as zipfp: - for archive, data in arch_data: - zipfp.writestr(archive, data) + return fp + + +class SPSMixinTests(mocker.MockerTestCase): - return fp def _makeOne(self, fname): class Foo(x_ray.SPSMixin, x_ray.Xray): @@ -27,7 +29,7 @@ class Foo(x_ray.SPSMixin, x_ray.Xray): def test_xmls_yields_etree_instances(self): data = [('bar.xml', b'bar')] - arch = self._make_test_archive(data) + arch = make_test_archive(data) pkg = self._makeOne(arch.name) xmls = pkg.xmls @@ -35,7 +37,7 @@ def test_xmls_yields_etree_instances(self): def test_xml_returns_etree_instance(self): data = [('bar.xml', b'bar')] - arch = self._make_test_archive(data) + arch = make_test_archive(data) pkg = self._makeOne(arch.name) self.assertIsInstance(pkg.xml, etree._ElementTree) @@ -45,7 +47,7 @@ def test_xml_raises_AttributeError_when_multiple_xmls(self): ('bar.xml', b'bar'), ('baz.xml', b'baz'), ] - arch = self._make_test_archive(data) + arch = make_test_archive(data) pkg = self._makeOne(arch.name) self.assertRaises(AttributeError, lambda: pkg.xml) @@ -54,7 +56,7 @@ def test_meta_journal_title_data_is_fetched(self): data = [ ('bar.xml', b'foo'), ] - arch = self._make_test_archive(data) + arch = make_test_archive(data) pkg = self._makeOne(arch.name) self.assertEqual(pkg.meta['journal_title'], 'foo') @@ -63,7 +65,7 @@ def test_meta_journal_title_is_None_if_not_present(self): data = [ ('bar.xml', b''), ] - arch = self._make_test_archive(data) + arch = make_test_archive(data) pkg = self._makeOne(arch.name) self.assertIsNone(pkg.meta['journal_title']) @@ -72,7 +74,7 @@ def test_meta_journal_eissn_data_is_fetched(self): data = [ ('bar.xml', b'1234-1234'), ] - arch = self._make_test_archive(data) + arch = make_test_archive(data) pkg = self._makeOne(arch.name) self.assertEqual(pkg.meta['journal_eissn'], '1234-1234') @@ -81,7 +83,7 @@ def test_meta_journal_eissn_is_None_if_not_present(self): data = [ ('bar.xml', b''), ] - arch = self._make_test_archive(data) + arch = make_test_archive(data) pkg = self._makeOne(arch.name) self.assertIsNone(pkg.meta['journal_eissn']) @@ -90,7 +92,7 @@ def test_meta_journal_pissn_data_is_fetched(self): data = [ ('bar.xml', b'1234-1234'), ] - arch = self._make_test_archive(data) + arch = make_test_archive(data) pkg = self._makeOne(arch.name) self.assertEqual(pkg.meta['journal_pissn'], '1234-1234') @@ -99,7 +101,7 @@ def test_meta_journal_pissn_is_None_if_not_present(self): data = [ ('bar.xml', b''), ] - arch = self._make_test_archive(data) + arch = make_test_archive(data) pkg = self._makeOne(arch.name) self.assertIsNone(pkg.meta['journal_pissn']) @@ -108,7 +110,7 @@ def test_meta_article_title_data_is_fetched(self): data = [ ('bar.xml', b'bar'), ] - arch = self._make_test_archive(data) + arch = make_test_archive(data) pkg = self._makeOne(arch.name) self.assertEqual(pkg.meta['article_title'], 'bar') @@ -117,7 +119,7 @@ def test_meta_article_title_is_None_if_not_present(self): data = [ ('bar.xml', b''), ] - arch = self._make_test_archive(data) + arch = make_test_archive(data) pkg = self._makeOne(arch.name) self.assertIsNone(pkg.meta['article_title']) @@ -126,7 +128,7 @@ def test_meta_issue_year_data_is_fetched(self): data = [ ('bar.xml', b'2013'), ] - arch = self._make_test_archive(data) + arch = make_test_archive(data) pkg = self._makeOne(arch.name) self.assertEqual(pkg.meta['issue_year'], '2013') @@ -135,7 +137,7 @@ def test_meta_issue_year_is_None_if_not_present(self): data = [ ('bar.xml', b''), ] - arch = self._make_test_archive(data) + arch = make_test_archive(data) pkg = self._makeOne(arch.name) self.assertIsNone(pkg.meta['issue_year']) @@ -144,7 +146,7 @@ def test_meta_issue_volume_data_is_fetched(self): data = [ ('bar.xml', b'2'), ] - arch = self._make_test_archive(data) + arch = make_test_archive(data) pkg = self._makeOne(arch.name) self.assertEqual(pkg.meta['issue_volume'], '2') @@ -153,7 +155,7 @@ def test_meta_issue_volume_is_None_if_not_present(self): data = [ ('bar.xml', b''), ] - arch = self._make_test_archive(data) + arch = make_test_archive(data) pkg = self._makeOne(arch.name) self.assertIsNone(pkg.meta['issue_volume']) @@ -162,7 +164,7 @@ def test_meta_issue_number_data_is_fetched(self): data = [ ('bar.xml', b'2'), ] - arch = self._make_test_archive(data) + arch = make_test_archive(data) pkg = self._makeOne(arch.name) self.assertEqual(pkg.meta['issue_number'], '2') @@ -171,7 +173,7 @@ def test_meta_issue_number_is_None_if_not_present(self): data = [ ('bar.xml', b''), ] - arch = self._make_test_archive(data) + arch = make_test_archive(data) pkg = self._makeOne(arch.name) self.assertIsNone(pkg.meta['issue_number']) @@ -180,7 +182,7 @@ def test_meta_is_not_valid(self): data = [ ('bar.xml', b''), ] - arch = self._make_test_archive(data) + arch = make_test_archive(data) pkg = self._makeOne(arch.name) self.assertFalse(pkg.is_valid_meta()) @@ -189,7 +191,7 @@ def test_meta_is_valid(self): data = [ ('bar.xml', b'12-343Titulo de artigo'), ] - arch = self._make_test_archive(data) + arch = make_test_archive(data) pkg = self._makeOne(arch.name) self.assertTrue(pkg.is_valid_meta()) @@ -243,7 +245,7 @@ def test_is_valid_schema_with_valid_xml(self): ''')] - arch = self._make_test_archive(data) + arch = make_test_archive(data) pkg = self._makeOne(arch.name) self.assertTrue(pkg.is_valid_schema()) @@ -296,7 +298,7 @@ def test_is_valid_schema_with_invalid_xml(self): ''')] - arch = self._make_test_archive(data) + arch = make_test_archive(data) pkg = self._makeOne(arch.name) self.assertFalse(pkg.is_valid_schema()) @@ -350,7 +352,7 @@ def test_is_valid_schema_with_wrong_tag(self): ''')] - arch = self._make_test_archive(data) + arch = make_test_archive(data) pkg = self._makeOne(arch.name) self.assertFalse(pkg.is_valid_schema()) @@ -371,7 +373,7 @@ def test_non_zip_archive_raises_ValueError(self): self.assertRaises(ValueError, lambda: x_ray.Xray(fp.name)) def test_get_ext_returns_member_names(self): - arch = self._make_test_archive( + arch = make_test_archive( [('bar.xml', b'bar')]) xray = x_ray.Xray(arch.name) @@ -379,7 +381,7 @@ def test_get_ext_returns_member_names(self): self.assertEquals(xray.get_ext('xml'), ['bar.xml']) def test_get_ext_returns_empty_when_ext_doesnot_exist(self): - arch = self._make_test_archive( + arch = make_test_archive( [('bar.xml', b'bar')]) xray = x_ray.Xray(arch.name) @@ -387,7 +389,7 @@ def test_get_ext_returns_empty_when_ext_doesnot_exist(self): self.assertEquals(xray.get_ext('jpeg'), []) def test_get_fps_returns_an_iterable(self): - arch = self._make_test_archive( + arch = make_test_archive( [('bar.xml', b'bar')]) xray = x_ray.Xray(arch.name) @@ -396,7 +398,7 @@ def test_get_fps_returns_an_iterable(self): self.assertTrue(hasattr(fps, 'next')) def test_get_fpd_yields_ZipExtFile_instances(self): - arch = self._make_test_archive( + arch = make_test_archive( [('bar.xml', b'bar')]) xray = x_ray.Xray(arch.name) @@ -405,7 +407,7 @@ def test_get_fpd_yields_ZipExtFile_instances(self): self.assertIsInstance(fps.next(), zipfile.ZipExtFile) def test_get_fps_swallow_exceptions_when_ext_doesnot_exist(self): - arch = self._make_test_archive( + arch = make_test_archive( [('bar.xml', b'bar')]) xray = x_ray.Xray(arch.name) @@ -415,8 +417,8 @@ def test_get_fps_swallow_exceptions_when_ext_doesnot_exist(self): def test_package_checksum_is_calculated(self): data = [('bar.xml', b'bar')] - arch1 = self._make_test_archive(data) - arch2 = self._make_test_archive(data) + arch1 = make_test_archive(data) + arch2 = make_test_archive(data) self.assertEquals( x_ray.Xray(arch1.name).checksum, @@ -424,7 +426,7 @@ def test_package_checksum_is_calculated(self): ) def test_get_members(self): - arch = self._make_test_archive( + arch = make_test_archive( [('bar.xml', b'bar'), ('jar.xml', b'bar')]) @@ -433,14 +435,14 @@ def test_get_members(self): self.assertEquals(xray.get_members(), ['bar.xml', 'jar.xml']) def test_get_members_returns_empty(self): - arch = self._make_test_archive([]) + arch = make_test_archive([]) xray = x_ray.Xray(arch.name) self.assertEquals(xray.get_members(), []) def test_get_fp(self): - arch = self._make_test_archive( + arch = make_test_archive( [('bar.xml', b'bar'), ('jar.xml', b'bar')]) @@ -450,7 +452,7 @@ def test_get_fp(self): zipfile.ZipExtFile) def test_get_fp_nonexisting_members(self): - arch = self._make_test_archive( + arch = make_test_archive( [('bar.xml', b'bar'), ('jar.xml', b'bar')]) @@ -459,7 +461,7 @@ def test_get_fp_nonexisting_members(self): self.assertRaises(ValueError, lambda: xray.get_fp('foo.xml')) def test_get_classified_members(self): - arch = self._make_test_archive( + arch = make_test_archive( [('bar.xml', b'bar'), ('jar.xml', b'bar')]) @@ -468,7 +470,7 @@ def test_get_classified_members(self): self.assertEquals(xray.get_classified_members(), {'xml': ['bar.xml', 'jar.xml']}) def test_get_ext_is_caseinsensitive(self): - arch = self._make_test_archive( + arch = make_test_archive( [('bar.xml', b'bar'), ('jar.XML', b'bar')]) @@ -477,7 +479,7 @@ def test_get_ext_is_caseinsensitive(self): self.assertEquals(xray.get_ext('xml'), ['bar.xml', 'jar.XML']) def test_get_ext_arg_is_caseinsensitive(self): - arch = self._make_test_archive( + arch = make_test_archive( [('bar.xml', b'bar'), ('jar.XML', b'bar')]) @@ -486,7 +488,7 @@ def test_get_ext_arg_is_caseinsensitive(self): self.assertEquals(xray.get_ext('XML'), ['bar.xml', 'jar.XML']) def test_get_classified_members_is_caseinsensitive(self): - arch = self._make_test_archive( + arch = make_test_archive( [('bar.xml', b'bar'), ('jar.XML', b'bar')]) @@ -495,7 +497,7 @@ def test_get_classified_members_is_caseinsensitive(self): self.assertEquals(xray.get_classified_members(), {'xml': ['bar.xml', 'jar.XML']}) def test_get_fps_is_caseinsensitive(self): - arch = self._make_test_archive( + arch = make_test_archive( [('bar.xml', b'bar'), ('jar.XML', b'bar')]) @@ -505,7 +507,7 @@ def test_get_fps_is_caseinsensitive(self): self.assertEqual([fp.name for fp in fps], ['bar.xml', 'jar.XML']) def test_get_fps_arg_is_caseinsensitive(self): - arch = self._make_test_archive( + arch = make_test_archive( [('bar.xml', b'bar'), ('jar.XML', b'bar')]) @@ -514,3 +516,4 @@ def test_get_fps_arg_is_caseinsensitive(self): self.assertEqual([fp.name for fp in fps], ['bar.xml', 'jar.XML']) +