Source code for school_scraper.fsdscraper

"""Scraper for the New Brunswick francophone school district website."""
import logging
from io import StringIO

import pandas as pd
import requests

from .fsddistrict import FSDDistrict
from .fsdschool import FSDSchool


class FSDScraper:
    """Interface for parsing HTML data loaded from the New Brunswick
    francophone school district website
    """

    # URL the HTML is downloaded from when this module is run as a script
    SCHEDULE_URL = \
        "https://bp.nbed.nb.ca/notices/BPRFtbl.aspx?dst=dsfs&vtbl=1"

    def __init__(self, html):
        """
        Args:
            html (str):
                HTML data loaded from the website. Is expected to contain a
                single HTML table containing rows describing each school in
                each school district

        Raises:
            AssertionError: if the HTML content does not pass
                :meth:`validate`
        """
        # Raised explicitly instead of through an ``assert`` statement so the
        # check is not stripped when Python runs with -O. AssertionError is
        # kept so existing callers see the same exception type as before.
        if not FSDScraper.validate(html):
            raise AssertionError(
                "HTML content failed validation; see log output for details")

        # validate() guarantees exactly one table was found
        self._data = FSDScraper._read_tables(html)[0].fillna("")

    def __str__(self):
        return self._data.to_markdown()

    @staticmethod
    def _read_tables(html):
        """Parses all tables found in some HTML content

        Shared by the constructor and :meth:`validate` so both always parse
        the content with exactly the same options.

        Args:
            html (str): HTML data loaded from the district website

        Returns:
            list (pandas.DataFrame): one data frame per HTML table found

        Raises:
            ValueError: propagated from pandas when the content can not be
                parsed into at least one table
        """
        # Recent pandas releases deprecate passing literal HTML text to
        # read_html, so text is wrapped in a file-like object. Anything that
        # is not a str is handed to pandas untouched.
        source = StringIO(html) if isinstance(html, str) else html
        return pd.read_html(
            source,
            header=0,
            converters={FSDSchool.MESSAGE_FIELD: str},
            flavor="lxml")

    @staticmethod
    def validate(html):
        """Checks to see if HTML loaded from the website is parseable

        Args:
            html (str): HTML data loaded from the district website

        Returns:
            bool:
                True if the HTML content was parseable, False if not. Details
                of any parsing errors are reported to the logger.
        """
        log = logging.getLogger(__name__)
        try:
            tables = FSDScraper._read_tables(html)
        except ValueError as err:
            log.error("Error parsing HTML input:")
            log.error(err)
            log.debug(html)
            return False

        if len(tables) != 1:
            log.error(
                "Expected 1 HTML table in the source data but "
                "found %d instead", len(tables))
            return False

        data = tables[0].fillna("")

        # Rendering the table is only worth the cost when it will be logged
        if log.isEnabledFor(logging.DEBUG):
            log.debug("Parsed HTML data table:")
            log.debug(data.to_markdown())

        # A missing column is reported as a validation failure rather than
        # escaping to the caller as a KeyError
        if FSDSchool.SCHOOL_FIELD not in data.columns:
            log.error(
                "Required column missing from HTML table: %s",
                FSDSchool.SCHOOL_FIELD)
            return False

        # Every school must have a name, and no name may appear twice
        seen_schools = set()
        for cur_school in data[FSDSchool.SCHOOL_FIELD]:
            if cur_school == "":
                log.error("Detected row with no valid school name")
                return False
            if cur_school in seen_schools:
                log.error(
                    "Multiple schools with the same name "
                    "detected: %s", cur_school)
                return False
            seen_schools.add(cur_school)

        if FSDDistrict.DISTRICT_FIELD not in data.columns:
            log.error(
                "Required column missing from HTML table: %s",
                FSDDistrict.DISTRICT_FIELD)
            return False

        # Every row must name the district it belongs to
        if any(cur == "" for cur in data[FSDDistrict.DISTRICT_FIELD]):
            log.error("Detected row with no valid district name")
            return False

        return True

    @property
    def districts(self):
        """list (FSDDistrict): 0 or more districts parsed from the HTML
        content
        """
        # Rows are grouped by district name ignoring case. The lower-cased
        # column is computed once and reused for every district.
        lowered = self._data[FSDDistrict.DISTRICT_FIELD].str.lower()
        return [
            FSDDistrict(self._data[lowered == cur_name])
            for cur_name in lowered.unique()
        ]

    def get_district(self, name):
        """Gets a specific district from the HTML content

        Args:
            name (str): the name of the district to locate

        Returns:
            FSDDistrict:
                Reference to the district details for the named district, or
                None if no district with the given name exists
        """
        # Names are compared without regard to case
        target = name.lower()
        for cur_district in self.districts:
            if cur_district.name.lower() == target:
                return cur_district
        return None

    @property
    def district_names(self):
        """list (str): list of names of all districts parsed from the HTML"""
        return [cur_district.name for cur_district in self.districts]

    @property
    def school_names(self):
        """list (str): list of unique names of all schools in all districts"""
        return [
            cur_school.name
            for cur_district in self.districts
            for cur_school in cur_district.schools
        ]
if __name__ == "__main__":  # pragma: no cover
    # Manual smoke test: download the live page, report whether it validates,
    # then parse it and print the resulting table.
    # requests never times out by default, so an explicit timeout keeps the
    # script from hanging forever when the server does not answer.
    text = requests.get(FSDScraper.SCHEDULE_URL, timeout=30).text
    print(FSDScraper.validate(text))
    obj = FSDScraper(text)
    print(obj)