# Copyright (C) 2017 Semester.ly Technologies, LLC
#
# Semester.ly is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Semester.ly is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
import logging
from parsing.library.logger import JSONStreamWriter
from parsing.library.tracker import NullTracker
from parsing.library.validator import Validator
from parsing.library.viewer import Hoarder
from parsing.library.utils import (
clean,
make_list,
safe_cast,
titlize,
time24,
short_date,
)
from parsing.library.exceptions import PipelineError, PipelineWarning
from parsing.library.validator import (
ValidationError,
ValidationWarning,
MultipleDefinitionsWarning,
)
class IngestionError(PipelineError):
    """Ingestor error class.

    Specialization of ``PipelineError`` for errors raised by the ingestor.
    """
class IngestionWarning(PipelineWarning):
    """Ingestor warning class.

    Specialization of ``PipelineWarning`` raised by ``Ingestor`` when it is
    handed a key it does not support or cannot resolve instructor aliases.
    """
class Ingestor(dict):
    """Ingest parsing data into formatted json.

    Mimics functionality of dict: the API user assigns raw parsed values
    under any of the keys in ``ALL_KEYS`` and then calls one of the
    ``ingest_*`` methods, which assembles, validates and writes the
    corresponding json object.

    Attributes:
        ALL_KEYS (set): Set of keys supported by Ingestor.
        break_on_error (bool): Break/cont on errors.
        break_on_warning (bool): Break/cont on warnings.
        data_list: Open ``$data`` list of the json output stream.
        hoarder (library.viewer.Hoarder): Viewer registered on the tracker.
        json (library.logger.JSONStreamWriter): Open json output stream.
        school (str): School code (e.g. jhu, gw, umich).
        skip_duplicates (bool): Skip ingestion for repeated definitions.
        tracker (library.tracker): Tracker object.
        validate (bool): Enable/disable validation.
        validator (library.validator): Validator instance; only set when
            ``validate`` is truthy.
    """

    ALL_KEYS = {
        "school",
        "school_subdivision_code",
        "school_subdivision_name",
        "kind",
        "department",
        "department_name",
        "department_code",
        "dept_name",
        "dept_code",
        "code",
        "course_code",
        "course",
        "name",
        "course_name",
        "prerequisites",
        "prereqs",
        "corequisites",
        "coreqs",
        "exclusions",
        "description",
        "descr",
        "areas",
        "level",
        "cores",
        "geneds",
        "homepage",
        "website",
        "instructors",
        "instructor",
        "instr",
        "instrs",
        "instr_name",
        "instr_names",
        "instructor_name",
        "section",
        "sections",
        "section_code",
        "section_name",
        "meeting_section",
        "section_type",
        "type",
        "term",
        "semester",
        "year",
        "capacity",
        "size",
        "enrollment",
        "enrolment",
        "waitlist",
        "waitlist_size",
        "remaining_seats",
        "fees",
        "fee",
        "cost",
        "final_exam",
        "offerings",
        "meetings",
        "time_start",
        "start_time",
        "time_end",
        "end_time",
        "date_start",
        "date_end",
        "location",
        "loc",
        "where",
        "days",
        "day",
        "dates",
        "date",
        "time",
        "credits",
        "num_credits",
        "campus",  # TODO - not really
        "score",
        "summary",
        "same_as",
        "pos",
        "writing_intensive",
        "sub_school",
        "course_section_id",
    }

    # Every alias under which instructor information may be supplied.
    _INSTRUCTOR_KEYS = frozenset(
        {
            "instructors",
            "instructor",
            "instr",
            "instrs",
            "instr_name",
            "instr_names",
            "instructor_name",
        }
    )

    def __init__(
        self,
        config,
        output,
        break_on_error=True,
        break_on_warning=False,
        display_progress_bar=True,
        skip_duplicates=True,
        validate=True,
        tracker=None,
    ):
        """Construct ingestor object and resolve options.

        Args:
            config (dict): Configuration dictionary; the school code is
                read from ``config["school"]["code"]``.
            output (str, file): Output path or file object.
            break_on_error (bool, optional): Stop ingesting on error.
            break_on_warning (bool, optional): Stop ingesting on warning.
            display_progress_bar (bool, optional): Accepted for interface
                compatibility; not used by this constructor.
            skip_duplicates (bool, optional): Skip ingesting courses
                that have already been seen.
            validate (bool, optional): Perform validation.
            tracker (library.tracker, optional): Tracker object. When
                omitted, a fresh ``NullTracker`` is created per instance.
        """
        # A default instance must not be built in the signature: it would be
        # created once and shared (viewers, school) by every Ingestor.
        if tracker is None:
            tracker = NullTracker()

        self.school = config["school"]["code"]
        self.validate = validate
        self.break_on_error = break_on_error
        self.break_on_warning = break_on_warning
        self.skip_duplicates = skip_duplicates

        self.tracker = tracker
        self.hoarder = Hoarder()
        self.tracker.add_viewer(self.hoarder)
        self.tracker.school = self.school

        # Initialize loggers for json and errors.
        self.json = JSONStreamWriter(output, type_=dict).enter()
        self.data_list = self.json.write("$data", type_=list).enter()

        if self.validate:
            self.validator = Validator(config, tracker=self.tracker)

        # Inherit dictionary functionality.
        super(Ingestor, self).__init__()

    def _get(self, *keys, **kwargs):
        """Match the first key found in self dictionary.

        Note that this is purposefully not an override to __get__.
        This allows the Ingestor to maintain dictionary-like
        functionality for the API user while internally checking itself.

        Args:
            *keys: Candidate keys, in order of precedence.
            **kwargs: Only ``default`` is read; it is the value returned
                when none of the keys is present (``None`` if omitted).

        Returns:
            The value stored under the first key that is present in the
            Ingestor instance, else the default.

        Raises:
            IngestionWarning: If a key examined before a match is found is
                not in ``Ingestor.ALL_KEYS``.
        """
        default = kwargs.get("default")
        for key in keys:
            if key not in Ingestor.ALL_KEYS:
                raise IngestionWarning(key + " not in Ingestor.ALL_KEYS")
            if key in self:
                return self[key]
        return default

    def _resolve_department(self):
        """Return the department, built from name/code keys when needed.

        The stored ``department`` value is returned as-is only when it is
        present and none of the separate name/code alias keys is set;
        otherwise a ``{"name", "code"}`` dict is assembled from the aliases.
        """
        has_alias = (
            "department_name" in self
            or "department_code" in self
            or "dept_name" in self
            or "dept_code" in self
        )
        if "department" in self and not has_alias:
            return self._get("department")
        return {
            "name": titlize(self._get("department_name", "dept_name")),
            "code": self._get("department_code", "dept_code"),
        }

    def _resolve_instructors(self):
        """Return instructors as a list in which strings become name dicts.

        Returns:
            ``None`` when no instructor key is set (or when the cleaned
            value is ``None``); otherwise a list where every ``str`` entry
            has been wrapped as ``{"name": entry}``.

        Raises:
            IngestionWarning: If more than one instructor alias key is set,
                since it is ambiguous which one to use.
        """
        present = Ingestor._INSTRUCTOR_KEYS.intersection(self)
        if not present:
            return None
        if len(present) > 1:
            # Sorted so the message is deterministic across runs.
            raise IngestionWarning(
                "cannot resolve instructors from keys: {}".format(
                    ",".join(sorted(present))
                ),
                self,
            )

        (key,) = present
        instructors = clean(make_list(self[key]))
        if instructors is None:
            return None
        return [
            {"name": entry} if isinstance(entry, str) else entry
            for entry in instructors
        ]

    def _resolve_date(self):
        """Return the meeting dates.

        When ``dates`` is set, the stored value is returned (``date`` takes
        precedence if both are set). Previously a value supplied only under
        ``dates`` was lost because only ``date`` was read. Otherwise a
        ``{"start", "end"}`` dict is built from ``date_start``/``date_end``.
        """
        if "dates" in self:
            return self._get("date", "dates")
        return {
            "start": short_date(self._get("date_start")),
            "end": short_date(self._get("date_end")),
        }

    def _resolve_time(self):
        """Return the meeting time.

        The stored ``time`` value is used when present; otherwise a
        ``{"start", "end"}`` dict is built by passing the start/end alias
        keys through ``time24``.
        """
        if "time" in self:
            return self._get("time")
        return {
            "start": time24(self._get("time_start", "start_time")),
            "end": time24(self._get("time_end", "end_time")),
        }

    def _resolve_location(self):
        """Return the meeting location.

        A string found under ``location``, ``loc`` or ``where`` (first
        match wins) is wrapped as ``{"where": value}``; a non-string first
        match falls back to whatever is stored under ``location``.
        """
        candidate = self._get("location", "loc", "where")
        if isinstance(candidate, str):
            return {"where": candidate}
        return self._get("location")

    def ingest_course(self):
        """Create course json from info in model map.

        The assembled object is passed through ``clean`` and then
        validated/written via ``_validate_and_log``. If the cleaned course
        still has a department, it is recorded on the tracker.

        Returns:
            dict: course
        """
        course = {
            "kind": "course",
            "school": {
                "code": self.school,
                "subdivisions": [
                    {
                        "code": self._get("school_subdivision_code"),
                        "name": self._get("school_subdivision_name"),
                    }
                ],
            },
            "code": self._get("course_code", "code", "course"),
            "name": titlize(self._get("name", "course_name")),
            "department": self._resolve_department(),
            "credits": safe_cast(
                self._get("credits", "num_credits"), float, default=0.0
            ),
            "prerequisites": make_list(self._get("prerequisites", "prereqs")),
            "corequisites": make_list(self._get("corequisites", "coreqs")),
            "exclusions": make_list(self._get("exclusions")),
            "areas": make_list(self._get("areas")),
            "level": self._get("level"),
            "cores": make_list(self._get("cores")),
            "geneds": make_list(self._get("geneds")),
            "sections": self._get("sections"),
            "homepage": self._get("homepage", "website"),
            "same_as": make_list(self._get("same_as")),
            "description": self._get("description", "descr"),
            "pos": make_list(self._get("pos")),
            "writing_intensive": self._get("writing_intensive"),
            "sub_school": self._get("sub_school"),
        }
        course = clean(course)
        self._validate_and_log(course)
        if "department" in course:
            self.tracker.department = course["department"]
        return course

    def ingest_section(self, course):
        """Create section json object from info in model map.

        The section's year and term are recorded on the tracker after
        validation.

        Args:
            course (dict): validated course object

        Returns:
            dict: section
        """
        section = {
            "kind": "section",
            "course": {"code": course.get("code")},
            "code": self._get("section_code", "section", "meeting_section"),
            "name": titlize(self._get("section_name")),
            "term": self._get("term", "semester"),
            "year": str(self._get("year")),
            "instructors": self._resolve_instructors(),
            "capacity": safe_cast(self._get("capacity", "size"), int),
            "enrollment": safe_cast(self._get("enrollment", "enrolment"), int),
            "waitlist": safe_cast(self._get("waitlist"), int),
            "waitlist_size": safe_cast(self._get("waitlist_size"), int),
            "remaining_seats": safe_cast(self._get("remaining_seats"), int),
            "type": self._get("type", "section_type"),
            "fees": safe_cast(self._get("fees", "fee", "cost"), float),
            "final_exam": self._get("final_exam"),
            "meetings": self._get("offerings", "meetings"),
            "course_section_id": safe_cast(self._get("course_section_id"), int),
        }
        section = clean(section)
        self._validate_and_log(section)
        self.tracker.year = section["year"]
        self.tracker.term = section["term"]
        return section

    def ingest_meeting(self, section, clean_only=False):
        """Create meeting ingested json map.

        Args:
            section (dict): validated section object
            clean_only (bool, optional): When true, return the cleaned
                meeting without validating, writing or updating the tracker.

        Returns:
            dict: meeting
        """
        if section.get("code") is None:
            # Without a section code the year/term are left unset.
            year = None
            term = None
        else:
            year = str(self._get("year"))
            term = self._get("term", "semester")

        meeting = {
            "kind": "meeting",
            "course": section.get("course"),
            "section": {
                "code": section.get("code"),
                "year": year,
                "term": term,
            },
            "days": make_list(self._get("days", "day")),
            "dates": self._resolve_date(),
            "time": self._resolve_time(),
            "location": self._resolve_location(),
        }
        meeting = clean(meeting)
        if clean_only:
            return meeting

        self._validate_and_log(meeting)
        if "time" in meeting:
            # Both endpoints are reported to the tracker, start first.
            self.tracker.time = meeting["time"]["start"]
            self.tracker.time = meeting["time"]["end"]
        return meeting

    def ingest_eval(self):
        """Create evaluation json object.

        The evaluation's year and term are recorded on the tracker after
        validation.

        Returns:
            dict: eval
        """
        evaluation = {
            "kind": "eval",
            "year": str(self._get("year")),
            "term": self._get("term"),
            "score": float(self._get("score")),
            "instructors": self._resolve_instructors(),
            "course": {"code": self._get("course_code")},
            "summary": self._get("summary"),
        }
        evaluation = clean(evaluation)
        self._validate_and_log(evaluation)
        self.tracker.year = evaluation["year"]
        self.tracker.term = evaluation["term"]
        return evaluation

    def end(self):
        """Finish ingesting.

        Close i/o, clear internal state, write meta info
        """
        self.data_list.exit()
        self.json.write(
            "$meta",
            {"$schools": self.hoarder.schools, "$timestamp": self.tracker.start_time},
        )
        self.json.exit()
        self.clear()

    def _validate_and_log(self, obj):
        """Validate ``obj`` (when enabled), write it and update stats.

        With validation disabled the object is written unconditionally.
        Otherwise it is written only if the validator accepted it, and
        nothing at all happens for objects the validator asked to skip.
        Afterwards the Ingestor's own keys are checked against
        ``ALL_KEYS``; the first unsupported key is logged as a warning.

        Args:
            obj (dict): Object to validate and write; must have a "kind".

        Raises:
            IngestionWarning: On an unsupported key when
                ``break_on_warning`` is set.
        """
        # Truthiness test, matching __init__: a validator only exists when
        # ``validate`` is truthy.
        if not self.validate:
            self.data_list.write(obj)
            self.tracker.stats = dict(kind=obj["kind"], status="total")
            return

        is_valid, skip = self._run_validator(obj)
        if skip:
            return
        if is_valid:
            self.data_list.write(obj)

        try:
            for key in self:
                if key in Ingestor.ALL_KEYS:
                    continue
                raise IngestionWarning(
                    self, "ingestor does not support key {}: {}".format(key, self[key])
                )
        except IngestionWarning:
            # Raised and caught locally so the log entry carries a traceback.
            logging.exception("Ingestor warning")
            if self.break_on_warning:
                raise
        self.tracker.stats = dict(kind=obj["kind"], status="total")

    def _run_validator(self, data):
        """Run the validator on ``data`` and interpret the outcome.

        Args:
            data (dict): Object to validate; must have a "kind".

        Returns:
            tuple: ``(is_valid, full_skip)``. ``is_valid`` is true when
            validation passed or only produced a non-skipped warning;
            ``full_skip`` is true for a ``MultipleDefinitionsWarning``
            while ``skip_duplicates`` is set.

        Raises:
            ValidationError: On a validation error when ``break_on_error``
                is set.
            ValidationWarning: On a non-skipped validation warning when
                ``break_on_warning`` is set.
        """
        is_valid = False
        full_skip = False
        logger = logging.getLogger("parsing.schools." + self.school)
        try:
            self.validator.validate(data)
            self.tracker.stats = dict(kind=data["kind"], status="valid")
            is_valid = True
        except ValidationError as e:
            if self.break_on_error:
                raise ValidationError(*e.args)
            logger.warning("Ingestion failed", exc_info=True)
            # A placeholder is required for the dump to be rendered at all.
            logger.debug("Ingestor dump: %s", dict(self))
        except ValidationWarning as e:
            if isinstance(e, MultipleDefinitionsWarning) and self.skip_duplicates:
                full_skip = True
            else:
                is_valid = True
                if self.break_on_warning:
                    raise ValidationWarning(*e.args)
                logger.warning("Validation warning", exc_info=True)
                logger.debug("Ingestor dump: %s", dict(self))
        return is_valid, full_skip