# Source code for parsing.library.extractor

# Copyright (C) 2017 Semester.ly Technologies, LLC
#
# Semester.ly is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Semester.ly is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.

# NOTE: module currently unused as it introduces too many bugs.
#       Might reconsider for later use.

import re

# import unicodedata

from collections import namedtuple

from parsing.library.utils import make_list

# One extraction rule: the result key, the callable that converts/holds the
# captured text, and the regex patterns tried for that key.
Extraction = namedtuple("Extraction", ["key", "container", "patterns"])


def extract_info_from_text(
    text, inject=None, extractions=None, use_lowercase=True, splice_text=True
):
    """Attempt to extract structured info from free text via regex patterns.

    NOTE: Currently unstable and unused as it introduces too many bugs.
          Might reconsider for later use.

    Each extraction is a ``(key, container, patterns)`` triple. Every pattern
    is searched once; on a match, the span of capture group 1 is sliced out of
    the original-case ``text``, converted with ``container`` and accumulated
    under ``key`` using ``+=``.

    Args:
        text (str): text to attempt to extract information from
        inject (dict, optional): mapping that extracted values are
            accumulated into in place. When None, a fresh dict is created
            and returned alongside the text.
        extractions (iterable, optional): ``(key, container, patterns)``
            triples (e.g. ``Extraction`` tuples). ``container`` is called
            with the captured string to convert it, and with no arguments to
            build the initial value for a key. Defaults to the built-in
            prereqs / coreqs / geneds / fee rules.
        use_lowercase (bool, optional): search against ``text.lower()``
            instead of ``text``; extracted values still come from the
            original-case text.
        splice_text (bool, optional): remove each successfully extracted
            match from ``text``.

    Returns:
        tuple or str: ``(text, extracted)`` when ``inject`` is None,
        otherwise only the (possibly trimmed) text.
    """
    if extractions is None:
        extractions = (
            Extraction(
                key="prereqs",
                container=make_list,
                patterns=(
                    r"pr-?ereq(?:uisite)?s?[:,\s]\s*(.*?)(?:\.|$)\s*",
                    r"take (.*)\.?$",
                ),
            ),
            Extraction(
                key="coreqs",
                container=make_list,
                patterns=(r"co-?req(?:uisite)?s?[:,\s]\s*(.*?)(?:\.|$)\s*",),
            ),
            Extraction(key="geneds", container=make_list, patterns=(r"ge (.*)",)),
            Extraction(
                key="fee",
                container=float,
                patterns=(r"(?:lab )?fees?:?\s{1,2}?\$?\s?(\d+(?:\.\d{1,2})?)",),
            ),
        )

    # Test against None (not truthiness) so an empty mapping supplied by the
    # caller is still filled in place instead of being silently replaced.
    extracted = {} if inject is None else inject

    for key, container, patterns in extractions:
        for pattern in patterns:
            # Rebuilt for every pattern because splicing below changes text.
            haystack = text.lower() if use_lowercase else text
            match = re.search(pattern, haystack)
            if not match:
                continue

            try:
                begin, end = match.span(1)
            except IndexError:
                # Pattern has no capture group; nothing to extract.
                continue
            if begin < 0:
                # Group 1 exists but did not participate in the match.
                continue

            try:
                # Offsets come from the searched string but are applied to
                # the original text to keep its casing; this assumes
                # lowercasing does not change the string's length.
                contained = container(text[begin:end])
                current = extracted.setdefault(key, container())
                current += contained
                # Store back explicitly: ``+=`` only rebinds the local name
                # for immutable values such as float, while lists are still
                # extended in place.
                extracted[key] = current
            except Exception:
                # Deliberately best-effort: a failed conversion or
                # accumulation just skips this pattern.
                continue

            if splice_text:
                text = text[: match.start()] + text[match.end() :]

    if inject is None:
        return text, extracted
    return text