Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
119 changes: 119 additions & 0 deletions packtools/stylechecker.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,119 @@
import re
import os
import logging

from lxml import etree

from packtools.utils import setdefault


# Directory that holds this module; bundled resources are resolved against it.
HERE = os.path.dirname(os.path.abspath(__file__))
# Maps a schema name to the path of the XSD file that implements it.
SCHEMAS = {
'SciELO-journalpublishing1.xsd': os.path.join(HERE, 'sps_xsd', 'sps.xsd'),
}
# Matches a single-quoted name (quotes included in the match) that comes
# right after the literal text "Element ", e.g. "'article'" in
# "Element 'article', attribute 'dtd-version': ...".
EXPOSE_ELEMENTNAME_PATTERN = re.compile(r"(?<=Element )'.*?'")

logger = logging.getLogger(__name__)


def XMLSchema(schema_name):
    """
    Build an ``etree.XMLSchema`` validator for a known schema.

    :param schema_name: Key of the ``SCHEMAS`` mapping.
    :returns: ``etree.XMLSchema`` compiled from the mapped XSD file.
    :raises KeyError: if ``schema_name`` is not present in ``SCHEMAS``.
    """
    # Binary mode hands the raw bytes to the XML parser, which then decodes
    # them itself instead of relying on platform text-mode handling.
    with open(SCHEMAS[schema_name], 'rb') as fp:
        xmlschema_doc = etree.parse(fp)

    return etree.XMLSchema(xmlschema_doc)


class XML(object):
    """
    An XML document paired with the XSD schema used to validate it.
    """
    def __init__(self, file):
        """
        :param file: Path to the XML file or etree.
        """
        if isinstance(file, etree._ElementTree):
            self.lxml = file
        else:
            self.lxml = etree.parse(file)

        self.xmlschema = XMLSchema('SciELO-journalpublishing1.xsd')

    def find(self, tagname, lineno):
        """
        Locate the element named ``tagname`` reported at line ``lineno``.

        :param tagname: Tag name of the wanted element.
        :param lineno: Source line the element must have (``sourceline``).
        :returns: The first matching element, or ``None`` if there is none.
        """
        for elem in self.lxml.findall('//' + tagname):
            if elem.sourceline == lineno:
                logger.debug('method *find*: hit a regular element: %s.', tagname)
                return elem

        # The root is checked explicitly as a fallback. It must satisfy the
        # same line constraint as any other element, otherwise a lookup for
        # a wrong line would silently resolve to the root.
        root = self.lxml.getroot()
        if root.tag == tagname and root.sourceline == lineno:
            logger.debug('method *find*: hit a root element.')
            return root

        return None

    def validate(self):
        """
        Validate the document against ``self.xmlschema``.

        Both values are produced on the first call and stored on the
        instance, so later calls return the stored values.

        :returns: Tuple ``(result, errors)`` where ``result`` is the outcome
                  of ``self.xmlschema.validate`` and ``errors`` is the
                  schema's ``error_log``.
        """
        result = setdefault(self, '__validation_result', lambda: self.xmlschema.validate(self.lxml))
        errors = setdefault(self, '__validation_errors', lambda: self.xmlschema.error_log)
        return result, errors

    def annotate_errors(self):
        """
        Add a notice to the tree right before each element reported as
        invalid by :meth:`validate`.

        The notice is a ``SPS-ERROR`` element whose text is the error
        message; when that raises ``TypeError`` a comment is added instead.

        :raises ValueError: if the element name cannot be extracted from an
                            error message, or the element cannot be found at
                            the reported line.
        """
        _, errors = self.validate()

        for error in errors:
            match = EXPOSE_ELEMENTNAME_PATTERN.search(error.message)
            if match is None:
                raise ValueError('Could not locate the element name in %s.' % error.message)

            # The pattern keeps the surrounding quotes; drop them.
            element_name = match.group(0).strip("'")

            err_element = self.find(element_name, error.line)
            if err_element is None:
                raise ValueError('Could not locate the erratic element %s at line %s to annotate: %s.' % (element_name, error.line, error.message))

            notice_element = etree.Element('SPS-ERROR')
            notice_element.text = error.message
            try:
                err_element.addprevious(notice_element)
            except TypeError:
                # In case of a root element, a comment is added.
                err_element.addprevious(etree.Comment('SPS-ERROR: %s' % error.message))

    def __str__(self):
        """
        Serialize the tree, pretty-printed, UTF-8 encoded and with the XML
        declaration.
        """
        return etree.tostring(self.lxml, pretty_print=True,
                              encoding='utf-8', xml_declaration=True)

    def __unicode__(self):
        """
        Same content as ``str(self)``, decoded from UTF-8.
        """
        return str(self).decode('utf-8')

    def __repr__(self):
        # Note that this runs the validation if it has not run yet.
        return '<packtools.stylechecker.XML xml=%s valid=%s>' % (self.lxml, self.validate()[0])

    def read(self):
        """
        Read the XML contents as text.
        """
        return unicode(self)


if __name__ == '__main__':
    # Command line entry point: validate one XML file and either print the
    # annotated document or a plain report of the validation errors.
    import argparse
    import sys

    cli = argparse.ArgumentParser(description='stylechecker cli utility.')
    cli.add_argument('--annotated', action='store_true')
    cli.add_argument('xmlpath', help='Absolute or relative path to the XML file.')
    options = cli.parse_args()

    document = XML(options.xmlpath)
    is_valid, errors = document.validate()

    if options.annotated:
        # Emit the whole document with the error notices embedded.
        document.annotate_errors()
        sys.stdout.write(str(document))
    elif is_valid:
        print('Valid XML! ;)')
    else:
        print('Invalid XML! Found %s errors:' % len(errors))
        for err in errors:
            print('%s,%s\t%s' % (err.line, err.column, err.message))

10 changes: 10 additions & 0 deletions packtools/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,3 +52,13 @@ def checksum_file(filepath, callable):

return hash.hexdigest()


def setdefault(object, attribute, producer):
    """
    Attribute counterpart of dict().setdefault.

    Returns the value of `attribute` on `object`. When the attribute is
    missing, `producer` is called with no arguments and its result is stored
    under that name first.
    """
    if hasattr(object, attribute):
        return getattr(object, attribute)

    produced = producer()
    setattr(object, attribute, produced)
    return getattr(object, attribute)

27 changes: 12 additions & 15 deletions packtools/xray.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,20 +8,13 @@
from lxml import etree

from . import utils
from . import stylechecker


logger = logging.getLogger(__name__)


def get_xmlschema(path):
    """
    Compile the XSD file at ``path`` into an ``etree.XMLSchema``.

    :param path: Path to the XSD file.
    :returns: ``etree.XMLSchema`` instance.
    """
    # The context manager closes the file handle once parsing is done;
    # binary mode leaves the decoding to the XML parser.
    with open(path, 'rb') as fp:
        xmlschema_doc = etree.parse(fp)

    return etree.XMLSchema(xmlschema_doc)


class SPSMixin(object):
xmlschema = get_xmlschema(os.path.join(
os.path.dirname(os.path.abspath(__file__)), 'sps_xsd', 'sps.xsd'))

@property
def xmls(self):
Expand Down Expand Up @@ -56,6 +49,13 @@ def meta(self):

return dct_mta

def is_valid_schema(self):
    """
    Tell whether the XML is valid against the SPS XSD.

    The answer is the first item returned by ``self.stylechecker.validate()``.
    More info at: https://github.com/scieloorg/scielo_publishing_schema
    """
    outcome = self.stylechecker.validate()
    return outcome[0]

def is_valid_meta(self):
"""
Checks if the minimum required data to identify a package is present.
Expand All @@ -65,13 +65,6 @@ def is_valid_meta(self):
meta['journal_eissn'] or meta['journal_pissn']) and (
meta['issue_volume'] or meta['issue_number']))

def is_valid_schema(self):
    """
    Tell whether the XML is valid against the SPS XSD.

    The answer is whatever ``self.xmlschema.validate`` returns for ``self.xml``.
    More info at: https://github.com/scieloorg/scielo_publishing_schema
    """
    validator = self.xmlschema
    return validator.validate(self.xml)

def is_valid_package(self):
"""
Validate if exist at least one XML file and one PDF file
Expand All @@ -91,6 +84,10 @@ def is_valid(self):
"""
return self.is_valid_package() and self.is_valid_schema() and self.is_valid_meta()

@property
def stylechecker(self):
    """
    ``stylechecker.XML`` wrapper around ``self.xml``.

    Obtained through ``utils.setdefault`` using the attribute name
    '__stylechecker'.
    """
    def build_checker():
        return stylechecker.XML(self.xml)

    return utils.setdefault(self, '__stylechecker', build_checker)


class Xray(object):

Expand Down
144 changes: 144 additions & 0 deletions tests/test_stylechecker.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,144 @@
#coding: utf-8
import unittest
from StringIO import StringIO
from tempfile import NamedTemporaryFile

from lxml import etree

from packtools import stylechecker


# Minimal schema used as a stand-in for the real XSD in the tests below.
# valid: <a><b></b></a>
# invalid: anything else
# NOTE(review): this single StringIO is handed to etree.parse() by several
# tests; presumably lxml reads it from the start every time -- confirm before
# reusing the fixture with another parser.
sample_xsd = StringIO('''\
<xsd:schema xmlns:xsd="http://www.w3.org/2001/XMLSchema">
<xsd:element name="a" type="AType"/>
<xsd:complexType name="AType">
<xsd:sequence>
<xsd:element name="b" type="xsd:string" />
</xsd:sequence>
</xsd:complexType>
</xsd:schema>
''')


def setup_tmpfile(method):
    """
    Decorator that provides ``self.valid_tmpfile`` to a test method.

    The attribute is a ``NamedTemporaryFile`` holding the document
    ``<a><b>bar</b></a>``, rewound to the start. It is closed when the
    decorated method finishes, whether it returns or raises.

    :param method: Test method taking only ``self``.
    :returns: The wrapped method.
    """
    import functools

    # Keep the test's own name and docstring so reports stay readable.
    @functools.wraps(method)
    def wrapper(self):
        valid_tmpfile = NamedTemporaryFile()
        valid_tmpfile.write(b'<a><b>bar</b></a>')
        valid_tmpfile.seek(0)
        self.valid_tmpfile = valid_tmpfile

        try:
            method(self)
        finally:
            # Release the temporary file even when the test fails.
            self.valid_tmpfile.close()
    return wrapper


class XMLTests(unittest.TestCase):
    """
    Tests for ``stylechecker.XML``.

    Except for the two initialization tests, every test replaces the
    instance's schema with the tiny ``sample_xsd`` schema.
    """

    @setup_tmpfile
    def test_initializes_with_filepath(self):
        self.assertTrue(stylechecker.XML(self.valid_tmpfile.name))

    def test_initializes_with_etree(self):
        fp = StringIO(b'<a><b>bar</b></a>')
        et = etree.parse(fp)

        self.assertTrue(stylechecker.XML(et))

    def test_validation(self):
        fp = etree.parse(StringIO(b'<a><b>bar</b></a>'))
        xml = stylechecker.XML(fp)
        xml.xmlschema = etree.XMLSchema(etree.parse(sample_xsd))

        result, errors = xml.validate()
        self.assertTrue(result)
        self.assertFalse(errors)

    def test_invalid(self):
        fp = etree.parse(StringIO(b'<a><c>bar</c></a>'))
        xml = stylechecker.XML(fp)
        xml.xmlschema = etree.XMLSchema(etree.parse(sample_xsd))

        result, _ = xml.validate()
        self.assertFalse(result)

    def test_invalid_errors(self):
        # Default lxml error log.
        fp = etree.parse(StringIO(b'<a><c>bar</c></a>'))
        xml = stylechecker.XML(fp)
        xml.xmlschema = etree.XMLSchema(etree.parse(sample_xsd))

        _, errors = xml.validate()
        self.assertIsInstance(errors, etree._ListErrorLog)

    def test_find(self):
        fp = etree.parse(StringIO(b'<a>\n<b>bar</b>\n</a>'))
        xml = stylechecker.XML(fp)
        xml.xmlschema = etree.XMLSchema(etree.parse(sample_xsd))

        elem = xml.find('b', 2)
        self.assertEqual(elem.tag, 'b')
        self.assertEqual(elem.sourceline, 2)

    def test_find_root_element(self):
        fp = etree.parse(StringIO(b'<a>\n<b>bar</b>\n</a>'))
        xml = stylechecker.XML(fp)
        xml.xmlschema = etree.XMLSchema(etree.parse(sample_xsd))

        elem = xml.find('a', 1)
        self.assertEqual(elem.tag, 'a')
        self.assertEqual(elem.sourceline, 1)

    def test_find_missing(self):
        fp = etree.parse(StringIO(b'<a>\n<b>bar</b>\n</a>'))
        xml = stylechecker.XML(fp)
        xml.xmlschema = etree.XMLSchema(etree.parse(sample_xsd))

        self.assertIsNone(xml.find('c', 2))

    def test_annotate_errors(self):
        fp = etree.parse(StringIO(b'<a><c>bar</c></a>'))
        xml = stylechecker.XML(fp)
        xml.xmlschema = etree.XMLSchema(etree.parse(sample_xsd))

        xml.annotate_errors()
        self.assertIn("<SPS-ERROR>Element 'c': This element is not expected. Expected is ( b ).</SPS-ERROR>", str(xml))

    # This test used to be named ``test_annotate_errors`` as well, which
    # replaced the test above in the class namespace so it never ran.
    def test_annotate_errors_read_returns_unicode(self):
        fp = etree.parse(StringIO(b'<a><c>bar</c></a>'))
        xml = stylechecker.XML(fp)
        xml.xmlschema = etree.XMLSchema(etree.parse(sample_xsd))

        xml.annotate_errors()
        xml_text = xml.read()

        self.assertIn("<SPS-ERROR>Element 'c': This element is not expected. Expected is ( b ).</SPS-ERROR>", xml_text)
        self.assertTrue(isinstance(xml_text, unicode))


class ElementNamePatternTests(unittest.TestCase):
    """
    Checks ``EXPOSE_ELEMENTNAME_PATTERN`` against sample validation messages.
    """
    pattern = stylechecker.EXPOSE_ELEMENTNAME_PATTERN

    def _exposed_name(self, message):
        # Text captured by the pattern, surrounding quotes included.
        return self.pattern.search(message).group(0)

    def test_case1(self):
        exposed = self._exposed_name(
            "Element 'article', attribute 'dtd-version': [facet 'enumeration'] The value '3.0' is not an element of the set {'1.0'}.")
        self.assertEqual(exposed, "'article'")

    def test_case2(self):
        exposed = self._exposed_name(
            "Element 'article', attribute 'dtd-version': '3.0' is not a valid value of the local atomic type.")
        self.assertEqual(exposed, "'article'")

    def test_case3(self):
        exposed = self._exposed_name(
            "Element 'author-notes': This element is not expected. Expected is one of ( label, title, ack, app-group, bio, fn-group, glossary, ref-list, notes, sec ).")
        self.assertEqual(exposed, "'author-notes'")

    def test_case4(self):
        exposed = self._exposed_name(
            "Element 'journal-title-group': This element is not expected. Expected is ( journal-id ).")
        self.assertEqual(exposed, "'journal-title-group'")

    def test_case5(self):
        exposed = self._exposed_name(
            "Element 'contrib-group': This element is not expected. Expected is one of ( article-id, article-categories, title-group ).")
        self.assertEqual(exposed, "'contrib-group'")

Loading