Source code for parsing.library.extractor

# Copyright (C) 2017 Semester.ly Technologies, LLC
#
# Semester.ly is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Semester.ly is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.

# NOTE: module currently unused as it introduces too many bugs.
#       Might reconsider for later use.

import re

# import unicodedata

from collections import namedtuple

from parsing.library.utils import make_list

Extraction = namedtuple("Extraction", "key container patterns")


[docs]def extract_info_from_text( text, inject=None, extractions=None, use_lowercase=True, splice_text=True ): """Attempt to extract info from text and put it into course object. NOTE: Currently unstable and unused as it introduces too many bugs. Might reconsider for later use. Args: text (str): text to attempt to extract information from extractions (None, optional): Description inject (None, optional): Description use_lowercase (bool, optional): Description Returns: str: the text trimmed of extracted information """ # text = text.encode('utf-8', 'ignore') if extractions is None: extractions = ( Extraction( key="prereqs", container=make_list, patterns=( r"pr-?ereq(?:uisite)?s?[:,\s]\s*(.*?)(?:\.|$)\s*", r"take (.*)\.?$", ), ), Extraction( key="coreqs", container=make_list, patterns=(r"co-?req(?:uisite)?s?[:,\s]\s*(.*?)(?:\.|$)\s*",), ), Extraction(key="geneds", container=make_list, patterns=(r"ge (.*)",)), Extraction( key="fee", container=float, patterns=(r"(?:lab )?fees?:?\s{1,2}?\$?\s?(\d+(?:\.\d{1,2})?)",), ), ) # Search for matches. extracted = inject or {} for key, container, patterns in extractions: for pattern in patterns: match = re.search(pattern, text.lower() if use_lowercase else text) if not match: continue try: contained = container( text[ match.start() + match.group().index(match.group(1)) : match.start() + match.group().index(match.group(1)) + len(match.group(1)) ] ) # magic... default = extracted.setdefault(key, container()) default += contained if splice_text: text = text[: match.start()] + text[match.end() :] except: continue # if isinstance(text, basestring): # text = text.decode('utf-8') # text = unicodedata.normalize('NFKD', text) if not inject: return text, extracted print(text) return text