Source code for parsing.library.validator

# Copyright (C) 2017 Technologies, LLC
# is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
# is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# GNU General Public License for more details.

# TODO - consider something to load db field sizes into validator
#        However, that would ruin the purity of the adapter.

import dateutil.parser as dparser
import http.client
import jsonschema
import logging
import re
import simplejson as json

from django.conf import settings

from parsing.library.tracker import Tracker
from parsing.library.exceptions import PipelineError, PipelineWarning
from parsing.library.utils import DotDict, dir_to_dict, SimpleNamespace

[docs]class ValidationError(PipelineError): """Validator error class."""
[docs]class ValidationWarning(PipelineWarning): """Validator warning class."""
[docs]class MultipleDefinitionsWarning(ValidationWarning): """Duplicated key in data definition."""
[docs]class Validator: """Validation engine in parsing data pipeline. Attributes: config (:obj:`DotDict`): Loaded config.json. course_code_regex (:obj:`re`): Regex to match course code. kind_to_validation_function (:obj:`dict`): Map kind to validation function defined within this class. KINDS (:obj:`set`): Kinds of objects that validator validates. relative (:obj:`bool`): Enforce relative ordering in validation. seen (:obj:`dict`): Running monitor of seen courses and sections tracker (:obj:`parsing.library.tracker.Tracker`) """ KINDS = { "config", "datalist", "course", "section", "meeting", "directory", "eval", "instructor", "final_exam", } def __init__(self, config, tracker=None, relative=True): """Construct validator instance. Args: config (dict): School config dictionary. tracker (None, optional): Description relative (bool, optional): Enforce relative ordering in validation. """ Validator.load_schemas() self.kind_to_validation_function = { kind: getattr(self, "validate_" + kind) if hasattr(self, "validate_" + kind) else lambda *_, **__: None for kind in Validator.KINDS } # Running monitor of validated course and section codes. self.seen = {} self.config = DotDict(config) self.config["kind"] = "config" self.validate(self.config) self.course_code_regex = re.compile(self.config.course_code_regex) self.relative = relative if tracker is None: # Used during self-contained validation. self.tracker = Tracker() = self.tracker.mode = "validating" self.tracker.start() else: self.tracker = tracker
[docs] @classmethod def load_schemas(cls, schema_path=None): """Load JSON validation schemas. NOTE: Will load schemas as static variable (i.e. once per definition), unless schema_path is specifically defined. Args: schema_path (None, str, optional): Override default schema_path """ if hasattr(cls, "SCHEMAS") and schema_path is None: return if schema_path is None: schema_path = "{}/{}/library/schemas".format( settings.BASE_DIR, settings.PARSING_MODULE ) def load(kind): filepath = "{}/{}.json".format(schema_path, kind) with open(filepath, "r") as file: schema = json.load(file) resolved = jsonschema.RefResolver("file://{}/".format(schema_path), schema) return (schema, resolved) cls.SCHEMAS = DotDict({kind: load(kind) for kind in cls.KINDS})
# TODO - make into a namedtuple instead
[docs] @staticmethod def schema_validate(data, schema, resolver=None): """Validate data object with JSON schema alone. Args: data (dict): Data object to validate. schema: JSON schema to validate against. resolver (None, optional): JSON Schema reference resolution. Raises: jsonschema.exceptions.ValidationError: Invalid object. """ try: jsonschema.Draft4Validator(schema, resolver=resolver).validate(data) except jsonschema.exceptions.ValidationError as e: raise ValidationError(data, *e.args)
# TODO - Create iter_errors from jsonschema validator # NOTE: if modifying schemas it may be prudent to catch: # jsonschema.exceptions.SchemaError # jsonschema.exceptions.RefResolutionError
[docs] @staticmethod def file_to_json(path, allow_duplicates=False): """Load file pointed to by path into json object dictionary. Args: path (str): allow_duplicates (bool, optional): Allow duplicate keys in JSON. Returns: dict: JSON-compliant dictionary. """ def raise_on_duplicates(ordered_pairs): """Reject duplicate keys in dictionary.""" d = {} for k, v in ordered_pairs: if k in d: raise ValidationError("duplicate key: %r" % (k,)) d[k] = v return d with open(path, "r") as f: if allow_duplicates: return json.load(f) return json.load(f, object_pairs_hook=raise_on_duplicates)
[docs] def validate(self, data, transact=True): """Validation entry/dispatcher. Args: data (list, dict): Data to validate. """ if transact: self.transaction = SimpleNamespace(key=None, values=set()) data = DotDict(data) Validator.schema_validate(data, *Validator.SCHEMAS[data.kind]) self.kind_to_validation_function[data.kind](data) if transact and self.transaction.key: self.seen.setdefault(self.transaction.key, set()).update( self.transaction.values )
[docs] def validate_self_contained( self, data_path, break_on_error=True, break_on_warning=False, output_error=None, display_progress_bar=True, master_log_path=None, ): """Validate JSON file as without ingestor. Args: data_path (str): Path to data file. break_on_error (bool, optional): Description break_on_warning (bool, optional): Description output_error (None, optional): Error output file path. display_progress_bar (bool, optional): Description master_log_path (None, optional): Description break_on_error (bool, optional) break_on_warning (bool, optional) display_progress_bar (bool, optional) Raises: ValidationError: Description """ data = Validator.file_to_json(data_path)["$data"] # Validator.schema_validate(data, *Validator.SCHEMAS.datalist) for obj in map(DotDict, data): try: self.validate(obj) self.tracker.stats = dict(kind=obj.kind, status="valid") except ValidationError as e: logging.exception("Validation error") if break_on_error: raise ValidationError(*e.args) except ValidationWarning as e: logging.warn(e) # warnings.warn('', e, stacklevel=2) self.tracker.stats = dict(kind=obj.kind, status="total") # TODO - this should be handled by caller self.tracker.end()
[docs] def validate_course(self, course): """Validate course. Args: course (DotDict): Course object to validate. Raises: MultipleDefinitionsWarning: Course has already been validated in same session. ValidationError: Invalid course. """ if "kind" in course and course.kind != "course": raise ValidationError(course, "course object must be of kind course") if "school" in course and != raise ValidationError(course, "course schools does not match config") if self.course_code_regex.match(course.code) is None: raise ValidationError( course, "course code {} does not match r'{}'".format( course.code, self.config.course_code_regex ), ) if ( "department" in course and "code" in course.department and "departments" in self.config ): department_codes = {d.code for d in self.config.departments} if course.department.code not in department_codes: raise ValidationError( course, "department {} is not in config.json departments".format( course.department ), ) if "homepage" in course: self.validate_website(course.homepage) for sa in course.get("same_as", []): if self.course_code_regex.match(sa) is not None: continue # FIXME -- should still do this check but it breaks due to the course not being written # raise ValidationWarning( # course, # "same as course code {} does not match r'{}'".format( # course.code, # self.config.course_code_regex # ) # ) if self.relative: if course.code in self.seen: raise MultipleDefinitionsWarning( course, "multiple definitions of course {}".format(course.code) ) self.transaction.key = course.code for section in course.get("sections", []): if "course" in section and section["course"]["code"] != course.code: raise ValidationError( course, "nested {} does not match parent {}".format( section["course"]["code"], course.code ), ) # NOTE: mutating dictionary section["course"] = {"code": course.code} section["kind"] = "section" self.validate(DotDict(section), transact=False)
[docs] def validate_section(self, section): """Validate section object. Args: section (DotDict): Section object to validate. Raises: MultipleDefinitionsWarning: Invalid section. ValidationError: Description """ if "course" not in section: raise ValidationError(section, "section doesnt define a parent course") if "kind" in section and section.kind != "section": raise ValidationError(section, "section must be of kind section") if ( "course" in section and self.course_code_regex.match(section.course.code) is None ): raise ValidationError( section, "course code {} does not match r'{}'".format( section.course.code, self.config.course_code_regex ), ) if "term" in section and section.term not in self.config.terms: raise ValidationError( section, "term {} not in config.json term list".format(section.term) ) if "instructors" in section: db_instructor_textfield_max_size = 500 instructor_textfield = "" for instructor in section.get("instructors", []): instructor = DotDict(instructor) if isinstance(, str): instructor_textfield += elif isinstance(, dict): instructor_textfield += "{} {}".format(, ) db_instructor_textfield_size = len(instructor_textfield) if db_instructor_textfield_size > db_instructor_textfield_max_size: raise ValidationError( section, "db field too small for comma-joined instructor names" ) for instructor in section.get("instructors", []): self.validate_instructor(instructor) if "final_exam" in section: if ( "course" in section.final_exam and section.final_exam.course.code != section.course.code ): raise ValidationError( section, "final exam course {} doesnt match course code {}".format( section.final_exam.course.code, section.course.code ), ) if ( "section" in section.final_exam and section.final_exam.section.code != section.code ): raise ValidationError( section, "final exam section {} doesnt match section {}".format( section.final_exam.section.code, section.code ), ) # final_exam['course'] = section.course # final_exam['section'] = {'code': section.code} # self.validate_final_exam(section.final_exam) if self.relative: if ( section.course.code not in self.seen and self.transaction.key != section.course.code ): raise ValidationError( "course code {} isnt defined".format(section.course.code), section ) elif (section.code, section.year, section.term) in self.seen.get( section.course.code, set() ) | self.transaction.values: raise MultipleDefinitionsWarning( section, "multiple defs for {} {} - {} already defined".format( section.course.code, section.code, section.year ), ) self.transaction.key = section.course.code self.transaction.values.add((section.code, section.year, section.term)) for meeting in section.get("meetings", []): meeting = DotDict(meeting) if "course" in meeting and meeting.course.code != section.course.code: raise ValidationError( section, "course code {} in meeting doesnt match parent section \ course code {}".format( meeting.course.code, section.course.code ), ) if "section" in meeting and meeting.section.code != section.code: raise ValidationError( section, "section code {} in nested meeting doesnt match parent \ section code {}".format( meeting.section.code, section.code ), ) # NOTE: mutating obj meeting["course"] = section.course meeting["section"] = { "code": section.code, "year": section.year, "term": section.term, } meeting["kind"] = "meeting" self.validate(DotDict(meeting), transact=False)
[docs] def validate_meeting(self, meeting): """Validate meeting object. Args: meeting (DotDict): Meeting object to validate. Raises: ValidationError: Invalid meeting. ValidationWarning: Description """ if "kind" in meeting and meeting.kind != "meeting": raise ValidationError(meeting, "meeting object must be kind instructor") if ( "course" in meeting and self.course_code_regex.match(meeting.course.code) is None ): raise ValidationError( meeting, "course code {} does not match regex '{}'".format( meeting.course.code, self.config.course_code_regex ), ) if "time" in meeting: try: self.validate_time_range(meeting.time.start, meeting.time.end) except (ValidationError, ValidationWarning) as e: message = "meeting for {} {}, ".format( meeting.course.code, meeting.section.code ) if isinstance(e, ValidationError): raise ValidationError(message, *e.args) raise ValidationWarning(message, *e.args) if "location" in meeting: try: self.validate_location(meeting.location) except ValidationError as e: message = "meeting for {} {}, ".format( meeting.course.code, meeting.section.code ) raise ValidationError(message, *e.args) if not self.relative: return if ( "course" in meeting and meeting.course.code not in self.seen and self.transaction is None ): raise ValidationError( meeting, "course code {} isnt defined".format(meeting.course.code) ) if "section" not in meeting: return if ( meeting.section.code, meeting.section.year, meeting.section.term, ) not in self.seen.get(meeting.course.code, set()) | self.transaction.values: raise ValidationError( meeting, "section {} isnt defined".format(meeting.section.code) )
[docs] def validate_eval(self, course_eval): """Validate evaluation object. Args: course_eval (DotDict): Evaluation to validate. Raises: ValidationError: Invalid evaulation. """ if self.course_code_regex.match(course_eval.course.code) is None: raise ValidationError( course_eval, "course code {} does not match r'{}'".format( course_eval.course.code, self.config.course_code_regex ), )
[docs] def validate_instructor(self, instructor): """Validate instructor object. Args: instructor (DotDict): Instructor object to validate. Raises: ValidationError: Invalid instructor. """ if "kind" in instructor and instructor.kind != "instructor": raise ValidationError( instructor, "instructor object must be of kind instructor" ) for class_ in instructor.get("classes", []): if ( "course" in class_ and self.course_code_regex.match(class_.course.code) is None ): raise ValidationError( instructor, "course code {} does not match given regex {}".format( class_.course.code, self.config.course_code_regex ), ) if "department" in instructor and "departments" in self.config: dept_codes = {d.code for d in self.config.departments} if instructor.department not in dept_codes: raise ValidationError( instructor, "department {} not listed in config.json".format( instructor.department ), ) if "homepage" in instructor: try: self.validate_homepage(instructor.homepage) except ValidationError as e: message = "instructor {} office, {}".format( raise ValidationError(message, *e.args) if "office" in instructor: try: if "location" in self.validate_location( for office_hour in"hours", []): self.validate_meeting(office_hour) except ValidationError as e: message = "instructor {} office, {}".format( raise ValidationError(message, *e.args)
[docs] def validate_final_exam(self, final_exam): """Validate final exam. NOTE: currently unused. Args: final_exam (DotDict): Final Exam object to validate. Raises: ValidationError: Invalid final exam. """ if "kind" in final_exam and final_exam.kind != "final_exam": raise ValidationError( final_exam, 'final_exam object must be of kind "final_exam"' ) try: self.validate_meeting(final_exam.meeting) except ValidationError as e: raise ValidationError(final_exam, *e.args)
[docs] def validate_location(self, location): """Validate location. Args: location (DotDict): Location object to validate. Raises: ValidationWarning: Invalid location. """ if "campus" in location and "campuses" in self.config: if location.campus not in self.config.campuses: raise ValidationWarning( location, "campus {} not in config".format(location.campus), ) if "building" in location and "buildings" in self.config: if location.building not in self.config.buildings: raise ValidationWarning( location, "building {} not in config".format(location.building), )
[docs] @staticmethod def validate_website(url): """Validate url by sending HEAD request and analyzing response. Args: url (str): URL to validate. Raises: ValidationError: URL is invalid. """ c = http.client.HTTPConnection(url) c.request("HEAD", "") # NOTE: 200 - good status # 301 - redirected if c.getresponse().status == 200 or c.getresponse().status == 301: return raise ValidationError(url, 'invalid website w/url "%s"'.format(url))
[docs] def validate_time_range(self, start, end): """Validate start time and end time. There exists an unhandled case if the end time is midnight. Args: start (str): Start time. end (str): End time. Raises: ValidationError: Time range is invalid. """ try: start, end = list(map(dparser.parse, [start, end])) except ValueError: raise ValidationError("invalid time format {}-{}".format(start, end)) if start > end: raise ValidationError("start {} > end {}".format(start, end)) elif start == end: pass # TODO - this should be reported
# raise ValidationWarning('start {} = end {}'.format(start, end)) # NOTE: there exists an unhandled case if the end time is midnight.
[docs] def validate_directory(self, directory): """Validate directory. Args: directory (str, dict): Directory to validate. May be either path or object. Raises: ValidationError: encapsulated IOError """ if isinstance(directory, str): try: name = directory directory = dir_to_dict(directory) directory["name"] = name except IOError as e: raise ValidationError(str(e)) Validator.schema_validate(directory, *