"""
***Functions to create site reports.***

Based upon a specific scrape (identified by its timestamp), reports can be
generated with the `site_report` function. All other functions support this
main function and are not meant to be used separately.

"""

import datetime as dt
import difflib
import json
import logging
import openpyxl
import re
import shutil
import string
import time
import xlsxwriter

from operator import itemgetter
from pathlib import Path
from typing import Union
from urllib.parse import urlparse

from bd_www import report_conf, mst_conn, mst_dir, Scrape
from bd_www.constants import REPORT_VERSION, ROOT_URL, DL_SITE


# Constants
REPORT_SUBDIRS = {
    'day': 'day_reports',
    'daily': 'daily_reports',
    'weekly': 'weekly_reports',
    'monthly': 'monthly_reports',
    'one-monthly': 'monthly_reports',
    'quarterly': 'three-monthly_reports',
    'three-monthly': 'three-monthly_reports',
    'half-yearly': 'six-monthly_reports',
    'six-monthly': 'six-monthly_reports',
    'yearly': 'twelve-monthly_reports',
    'twelve-monthly': 'twelve-monthly_reports'
}
"Dictionary where reports will be stored, with report periods as keys and " \
    "subdirectories as values."
_CELL_MAX = 32500
"Maximum number of characters per worksheet cell."
_REP_REL_NOTES = [
    'Release notes version 2.0.1',
    '- no changes of report content (only technical changes)',
    ' ',
    'Release notes version 2.0.0',
    '- added \'Downloads\' sheet with all activated download links for the ',
    '     period, sorted by descending downloads per link',
    '     (to avoid green error triangles in cells, deselect next option: ',
    '         File > Options > Formulas > Error checking rules',
    '         > Numbers formatted as text or preceded by an apostrophe)',
    '- added column \'pagetype\' to \'Editorial links\' sheet',
    '- changed column order of \'Editorial links\' sheet',
    ' ',
    'Release notes version 1.9.1',
    '- no changes of report content (only technical changes)',
    ' ',
    'Release notes version 1.9.0',
    '- new key figures: satisfaction rate for period and year-to-date',
    '- non integer values now have three decimals (was two)',
    '- column name \'Final wcm path\' changed to \'Url or final path\' in ',
    '  \'Redirects and aliases\' sheet',
    '- column name \'Link id\' changed to \'Page id of link\' in ',
    '  \'Editorial links\' sheet',
    '- full and public report versions now have different colours',
    ' ',
    'Release notes version 1.8.4',
    '- textual feedback removed to ensure data privacy; full report can be',
    '     applied for on functional need basis',
    ' ',
    'Release notes version 1.8.3',
    '- bug fixed in retrieval of the segmented metrics; especially the ',
    '     metrics \'Call visits\' and \'Call rate\' are not to be trusted ',
    '     in previous report versions',
    ' ',
    'Release notes version 1.8.2',
    '- filtering/cleansing textual feedback improved and also applied to ',
    '     \'All feedback\' sheet',
    '- added \'H1 words\' column to \'New/All pages\' sheets',
    ' ',
    'Release notes version 1.8.1',
    '- several improvements for textual user feedback:',
    '     - leading whitespace removed',
    '     - meaningless feedback filtered out',
    '     - sorted case-insensitive and alphabetic before non-alphabetic',
    '- truncated textual feedbacks in the \'Removed/New/All pages\' sheets ',
    '     that exceeded the cell limit',
    '- added \'All feedback\' sheet with textual feedback for the period,',
    '     sorted by descending feedbacks per page',
    '- \'All pages\' sheet sorted by descending number of visits',
    '- added two segmented metrics and their corresponding rates to the ',
    '     \'Removed/New/All pages\' sheets:',
    '     - \'Organic entries\' and \'Org entry rate\': number and fraction ',
    '       of visits that originated from an external search engine',
    '     - \'Call visits\' and \'Call rate\': number and fraction of visits ',
    '       that included this page as well as a page with calling information',
    '- added \'Data legend\' sheet with descriptions of the various data ',
    '     fields and terms',
    '- truncated these release notes to the last three major versions'
]
"Combined release notes to show in each report."

logger = logging.getLogger()


class ReportWorkbook(xlsxwriter.Workbook):
    """
    **Workbook subclass for writing site reports.**

    Instantiating this class creates a site report (workbook) with one
    sheet named 'Report'. This sheet contains the information given at
    instantiation, together with the release notes of the site report.

    ***Instance methods:***

    - `get_fmt`: get a format to use when writing cells in a sheet
    - `add_datasheet`: add a sheet with tabular data
    - `add_buttons`: add navigation buttons to the 'Report' sheet
    - `close_and_publish`: call finalisation code, close file and publish it

    ***Instance attributes:***

    - `rep_fmts`: format library to be used when writing data to report sheets
      using the `add_datasheet` method; this library can be accessed using
      the `get_fmt` instance method

    The formats in the `rep_fmts` library can be accessed via three-level
    indexing with respective values for `num_format`, `alignment` and
    `shading`. While `shading` is boolean, the potential values for
    `num_format` and `alignment` are given below and can be used as values
    in the *cols* column specifications of the report configuration file.

    ***Values for `num_format`***

    - 'std': 'General' format according to the spreadsheet application
    - 'id':  plain integer
    - 'int': integer with thousands separators
    - 'int-delta': +/- signed integer with thousands separators
    - 'flt3': float with three decimals
    - 'flt3-delta': +/- signed float with three decimals
    - 'pct0': percentage without decimals
    - 'pct2': percentage with two decimals
    - 'hdr': format for cells with column headers

    Two additional formats can be used that depend on the type of value to be
    written in a cell:

    - 'int/flt3': 'int' for integer, 'flt3' for float
    - 'int-delta/flt3-delta': 'int-delta' for integer, 'flt3-delta' for float

    ***Values for `alignment`***

    - 'std': default alignment according to the spreadsheet application
    - 'left': left aligned
    - 'center': center aligned
    - 'right': right aligned
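
    For illustration, a minimal usage sketch (assuming a `ReportWorkbook`
    instance `wb` has been created):

        # Format for an integer cell, right aligned, on a shaded row:
        fmt = wb.get_fmt('int', 'right', shaded=True)
        # Equivalent direct access to the format library:
        fmt = wb.rep_fmts['int']['right'][True]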
    """

    def __init__(self, wb_path: Path, rep_info: list[tuple]) -> None:
        """
        **Create and initialise a report workbook.**

        Arguments:

            wb_path: path of the report workbook to be created
            rep_info: name/values pairs to be written to the 'Report' sheet

        Returns:

            instance of ReportWorkbook

        A new workbook is created with one sheet named 'Report', containing
        report information given via the `rep_info` parameter, and release
        notes.
        """
        super().__init__(wb_path, {'constant_memory': True})

        # Store all potential formats to write the sheets
        self.rep_fmts = {}
        f0 = {'border_color': report_conf.colour_brdr, 'left': 1, 'right': 1}
        for num_fmt in ('std', 'id', 'int', 'int-delta', 'flt3',
                        'flt3-delta', 'pct0', 'pct2', 'hdr'):
            self.rep_fmts[num_fmt] = {}
            f1 = {}
            match num_fmt:
                case 'std':
                    f1 = {'num_format': 'General'}
                case 'id':
                    f1 = {'num_format': '0'}
                case 'int':
                    f1 = {'num_format': '#,##0'}
                case 'int-delta':
                    f1 = {'num_format': '+#,##0;-#,##0;-'}
                case 'flt3':
                    f1 = {'num_format': '0.000'}
                case 'flt3-delta':
                    f1 = {'num_format': '+0.000;-0.000;-'}
                case 'pct0':
                    f1 = {'num_format': '0%'}
                case 'pct2':
                    f1 = {'num_format': '0.00%'}
                case 'hdr':
                    f1 = {'bold': True, 'font_color': 'white',
                          'fg_color': report_conf.colour_hdr_bg,
                          'border_color': 'white'}
            for align in ('std', 'left', 'center', 'right'):
                self.rep_fmts[num_fmt][align] = {}
                f2 = {} if align == 'std' else {'align': align}
                for shade in (False, True):
                    f3 = {'fg_color': report_conf.colour_shade} if shade else {}
                    self.rep_fmts[num_fmt][align][shade] = \
                        self.add_format(f0 | f1 | f2 | f3)

        # Insert 'Report' worksheet and add report information and release notes
        ws, (row, col) = self.add_datasheet('Report', rep_info,
                                            shading='number')
        max_length = max([len(n) for n in _REP_REL_NOTES])
        ws.insert_textbox(
            row + 3, 0, '\n'.join(_REP_REL_NOTES),
            {'x_offset': 20,
             'object_position': 3,
             'line': {'color': report_conf.colour_hdr_bg, 'width': 1},
             'fill': {'color': report_conf.colour_shade},
             'width': max_length * 6.4,
             'height': len(_REP_REL_NOTES) * 18 + 30,
             'align': {'vertical': 'middle'}}
        )

    def get_fmt(self, num_format: str, alignment: str = 'std',
                shaded: bool = False) -> xlsxwriter.workbook.Format:
        """
        **Get format to write report cells.**

        Arguments:

            num_format: specification of the required number format
            alignment: specification of the required alignment
            shaded: True if the cell should have a slightly darker background

        Returns:

            format to be used when writing worksheet cells

        Potential values of the `num_format` and `alignment` parameters are
        documented in the `ReportWorkbook` class.
        """
        return self.rep_fmts[num_format][alignment][shaded]

    def add_datasheet(self, sh_name: str, data: list[Union[tuple, list]],
                      incl_txt_fb: bool = True, shading: str = None,
                      cluster_col: int = 0, cluster_del: bool = False) \
            -> tuple[xlsxwriter.workbook.Worksheet, tuple[int, int]]:
        """
        **Add a data sheet to the report workbook.**

        Arguments:

            sh_name: name of the sheet
            data: two-dimensional data structure with rows of values
            incl_txt_fb: if False, columns with textual feedback will be skipped
            shading: method to shade rows (None, 'number' or 'cluster')
            cluster_col: zero indexed column number of the value used for
                shading
            cluster_del: do not write the cluster value to the sheet

        Returns:

            reference to the new worksheet and the last (row, col) position

        The configuration for the sheet is read from the report configuration
        file. The `sh_name` with which the sheet is created has to be present
        as first level key value in this configuration file.

        When `shading` is 'number', the background shading of the rows is
        toggled every one to three rows (depending on the total number of
        rows) for better readability. When `shading` is 'cluster', the
        shading is toggled whenever the value in column `cluster_col`
        changes. This value will not be written to the sheet if
        `cluster_del` is True. By default the sheet rows have no shading.
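
        For illustration, a sheet specification might look as follows after
        loading from the configuration file (sheet name, column specs and
        values are hypothetical):

            {"Example sheet": {
                "cols": [["Page id", 10, "id", "right"],
                         ["Title", 60, "std", "left"],
                         ["Visits", 12, "int", "right"]],
                "frozen_rows": 1,
                "frozen_cols": 0,
                "show_grid": false,
                "autofilter": true}}

        A corresponding call could then be:

            wb.add_datasheet('Example sheet', [(101, 'Home', 5000)],
                             shading='number')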
        """

        # Get the specifications for this sheet
        try:
            with open(report_conf.rep_config_json) as f:
                sheet_spec = json.load(f)[sh_name]
        except KeyError:
            raise ValueError(
                f'no specifications available in {report_conf.rep_config_json} '
                f'for sheet named {sh_name}')

        col_specs = [s for s in sheet_spec['cols']
                     if incl_txt_fb or 'feedback txt' not in s[0]]

        # Create and configure the sheet
        sheet = self.add_worksheet(sh_name)
        for col, col_spec in enumerate(col_specs):
            sheet.write(0, col, col_spec[0], self.get_fmt('hdr'))
            sheet.set_column(col, col, col_spec[1])
        if sheet_spec['frozen_rows'] or sheet_spec['frozen_cols']:
            # With both zero, xlsxwriter delivers a partly corrupt xlsx
            sheet.freeze_panes(
                sheet_spec['frozen_rows'], sheet_spec['frozen_cols'])
        if sheet_spec['show_grid']:
            sheet.hide_gridlines(0)
        else:
            sheet.hide_gridlines(2)
        if sheet_spec['autofilter']:
            sheet.autofilter(0, 0, len(data), len(col_specs) - 1)

        # Write the data
        shaded = False
        shade_num = 0
        last_cluster = ''
        if shading == 'number':
            # Shading toggles every shade_num rows
            if len(data) <= 10:
                shade_num = 1
            elif len(data) < 20:
                shade_num = 2
            else:
                shade_num = 3
        row, col = 1, 0  # Initialize to avoid inspection warning
        for row, values in enumerate(data, start=1):
            if shading == 'number':
                # Toggle shading after `shade_num` rows
                shaded = row % (2 * shade_num) not in list(
                    range(1, shade_num + 1))
            elif shading == 'cluster':
                # Toggle shading when the cluster value changes
                if cluster_del:
                    values = list(values)
                    cluster = values.pop(cluster_col)
                else:
                    cluster = values[cluster_col]
                if not last_cluster:
                    last_cluster = cluster
                if cluster != last_cluster:
                    shaded = not shaded
                    last_cluster = cluster
            for col, value in enumerate(values):
                num_format, alignment = col_specs[col][2:4]
                if '/' in num_format:
                    for fmt in num_format.split('/'):
                        if ('flt' in fmt and type(value) == float) or \
                                ('int' in fmt and type(value) in (int, str)):
                            num_format = fmt
                            break
                cell_format = self.get_fmt(num_format, alignment, shaded)
                sheet.write(row, col, value, cell_format)

        # Return the resulting data structures
        return sheet, (row, col)

    def add_buttons(self) -> None:
        """
        **Create buttons to navigate to data sheets.**

        Sheet buttons are created on the 'Report' sheet as text boxes with
        clickable links to the various data sheets.
        """

        worksheets = self.worksheets().copy()
        report_sheet = worksheets.pop(0)
        head_spc = 35
        button_h = 40
        button_w = 180
        button_spc = 20
        lr_margin = 25

        report_sheet.insert_textbox(
            'D2', 'Go to sheet',
            {
                'width': button_w + 2 * lr_margin,
                'height': len(worksheets) * (button_h + button_spc) + head_spc,
                'y_offset': 10,
                'object_position': 3,
                'line': {'color': 'black'},
                'fill': {'color': 'white'},
                'align': {'horizontal': 'left'},
                'font': {'bold': True}
            }
        )
        for num, ws in enumerate(worksheets):
            report_sheet.insert_textbox(
                'D2', ws.name,
                {
                    'width': button_w,
                    'height': button_h,
                    'x_offset': lr_margin + 3,
                    'y_offset': 10 + head_spc + num * (button_h + button_spc),
                    'object_position': 3,
                    'line': {'color': report_conf.colour_btn_brdr, 'width': 1},
                    'fill': {'color': report_conf.colour_btn_fill},
                    'align': {'horizontal': 'center', 'vertical': 'middle'},
                    'font': {'color': report_conf.colour_btn_text,
                             'size': 12, 'bold': True},
                    'tip': 'press alt-left to return to the last visited sheet',
                    'url': f"internal:'{ws.name}'!A2"
                }
            )

    def close_and_publish(self) -> None:
        """
        **Call finalisation code, close file and publish it.**

        Publishing means copying the report to the configured publish
        directory (ref. `bd_www.Config`). In case this directory does not
        exist or is unreachable, an error will be logged but processing
        continues.
        """
        publ_dir = Path(report_conf.publ_dir)
        rep_file: Path = self.filename
        publ_file = publ_dir / rep_file.name
        super().close()
        try:
            shutil.copy(rep_file, publ_file)
        except OSError:
            logger.error('Publishing the report was not successful')


def site_report(ts: str,
                report_set: str = 'std',
                full_info: bool = True,
                publish: bool = False,
                log_to_file: bool = True) -> None:
    """
    **Generate site reports.**

    Arguments:

        ts: timestamp of the scrape [yymmdd-hhmm] that defines the site status
        report_set: defines the types of reports to be generated ['min',
            'std' or 'max']
        full_info: add information (sheets) for all pages, links, redirs and
            paths
        publish: copy generated reports to the configured publish location
        log_to_file: log messages will be saved to file as well as to console

    Reports are generated for the site as it was during the scrape with
    timestamp `ts`. Each report is referenced to a scrape at a different
    logical (calendar) period earlier. Potential reports that will be
    generated are:

    - day: report covering one day without reference to another scrape
    - daily: report with reference to a scrape one day before
    - weekly: report with reference to a scrape one week before
    - one-monthly: report with reference to a scrape one month before
    - monthly: one-monthly report starting on the first day of the month
    - three-monthly: report starting on the first day of the month with
      reference to a scrape three months before
    - quarterly: three-monthly report starting on the first day of January,
      April, July or October
    - six-monthly: report starting on the first day of the month with reference
      to a scrape six months before
    - half-yearly: six-monthly report starting on the first day of January
      or July
    - twelve-monthly: report starting on the first day of the month with
      reference to a scrape twelve months before
    - yearly: twelve-monthly report starting on January 1st

    The actual value of the `report_set` parameter limits the reports that
    will be generated:

    - 'min': weekly on Monday, (one-)monthly on the first day of the month
    - 'std': all dailies, weekly on Monday, all monthlies on the first day of
      the month, potentially also resulting in quarterly, half-yearly and/or
      yearly reports
    - 'max': all possible reports (use with care!)
    - one value from the available report types ('daily', 'weekly', etc.)

    Only those reports will be generated for which the appropriate reference
    scrape is available. If no earlier scrape exists for any of the logical
    calendar periods, reporting falls back to a day report only (i.e. without
    reference to another scrape), but only if no specific type was requested.

    **Note:**
    *The actual workhorse to generate the necessary reports is the
    `report_period` function.*
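
    A typical invocation (timestamp hypothetical):

        site_report('240101-0400', report_set='std', publish=True)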
    """

    # Setup logging
    global logger
    logger = logging.getLogger('report')
    logger.setLevel(logging.INFO)
    log_file = str(mst_dir / report_conf.log_name)
    formatter = logging.Formatter(
        fmt='[%(asctime)s] %(levelname)-8s - %(message)s',
        datefmt='%Y-%m-%d %H:%M:%S')
    ch = logging.StreamHandler()
    ch.setLevel(logging.INFO)
    ch.setFormatter(formatter)
    logger.addHandler(ch)
    if log_to_file:
        fh = logging.FileHandler(log_file)
        fh.setLevel(logging.INFO)
        fh.setFormatter(formatter)
        logger.addHandler(fh)

    def months_back(date: dt.date, months: int) -> dt.date:
        """
        **Shift a date some months back.**

        Arguments:

            date: reference date
            months: number of months to shift

        Returns:

            reference date minus the number of months
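
        Days are clamped to the length of the target month, e.g.:

            months_back(dt.date(2024, 3, 31), 1)  # -> dt.date(2024, 2, 29)
            months_back(dt.date(2024, 3, 15), 2)  # -> dt.date(2024, 1, 15)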
        """
        eom = date
        for _ in range(months):
            eom = eom.replace(day=1) - one_day
        if eom.day >= date.day:
            return eom.replace(day=date.day)
        else:
            return eom

    valid_types = set(REPORT_SUBDIRS.keys())
    valid_types.discard('day')

    # Set reporting dates for all potential reports
    report = {}
    one_day = dt.timedelta(days=1)
    scr_date = ts_to_d(ts)
    per_end = scr_date - one_day
    #
    per_start = per_end
    ref_end = ref_start = per_start - one_day
    report['daily'] = [per_start, per_end, ref_start, ref_end]
    #
    per_start = scr_date - dt.timedelta(days=7)
    ref_end = per_start - one_day
    ref_start = per_start - dt.timedelta(days=7)
    report['weekly'] = [per_start, per_end, ref_start, ref_end]
    #
    if scr_date.day == 1:
        per_start = (scr_date - dt.timedelta(days=28)).replace(day=1)
        ref_end = per_start - one_day
        ref_start = (per_start - dt.timedelta(days=28)).replace(day=1)
        report['monthly'] = [per_start, per_end, ref_start, ref_end]
    else:
        per_start = months_back(scr_date, 1)
        ref_start = months_back(scr_date, 2)
        ref_end = per_start - one_day
        report['one-monthly'] = [per_start, per_end, ref_start, ref_end]
    #
    if scr_date.day == 1 and scr_date.month % 3 == 1:
        per_start = (scr_date - dt.timedelta(days=90)).replace(day=1)
        ref_end = per_start - one_day
        ref_start = (per_start - dt.timedelta(days=90)).replace(day=1)
        report['quarterly'] = [per_start, per_end, ref_start, ref_end]
    else:
        per_start = months_back(scr_date, 3)
        ref_start = months_back(scr_date, 6)
        ref_end = per_start - one_day
        report['three-monthly'] = [per_start, per_end, ref_start, ref_end]
    #
    if scr_date.day == 1 and scr_date.month % 6 == 1:
        per_start = (scr_date - dt.timedelta(days=181)).replace(day=1)
        ref_end = per_start - one_day
        ref_start = (per_start - dt.timedelta(days=181)).replace(day=1)
        report['half-yearly'] = [per_start, per_end, ref_start, ref_end]
    else:
        per_start = months_back(scr_date, 6)
        ref_start = months_back(scr_date, 12)
        ref_end = per_start - one_day
        report['six-monthly'] = [per_start, per_end, ref_start, ref_end]
    #
    if scr_date.day == 1 and scr_date.month == 1:
        per_start = (scr_date - dt.timedelta(days=365)).replace(day=1)
        ref_end = per_start - one_day
        ref_start = (per_start - dt.timedelta(days=365)).replace(day=1)
        report['yearly'] = [per_start, per_end, ref_start, ref_end]
    else:
        per_start = months_back(scr_date, 12)
        ref_start = months_back(scr_date, 24)
        ref_end = per_start - one_day
        report['twelve-monthly'] = [per_start, per_end, ref_start, ref_end]

    # Drop potential reports that are not requested
    for per_type in list(report.keys()):
        per_start, per_end, ref_start, ref_end = report[per_type]
        match report_set:
            case 'min':
                match per_type:
                    case 'weekly':
                        if per_start.isoweekday() != 1:
                            del report[per_type]
                        continue
                    case 'monthly':
                        if per_start.day != 1:
                            del report[per_type]
                        continue
                    case _:
                        del report[per_type]
                        continue
            case 'std':
                match per_type:
                    case 'weekly':
                        if per_start.isoweekday() != 1:
                            del report[per_type]
                        continue
                    case 'daily':
                        continue
                    case _:
                        if per_start.day != 1:
                            del report[per_type]
                        continue
            case 'max':
                continue
            case _:
                if report_set != per_type:
                    del report[per_type]
                continue

    # Form arguments for reports that are covered by available scrapes
    report_args = []
    for per_type in report.keys():
        per_start, per_end, ref_start, ref_end = report[per_type]
        if qry_result := mst_conn.execute(
                'SELECT timestamp FROM mst_scrapes WHERE date = ?',
                [per_start]).fetchone():
            ref_ts = qry_result[0]
            prev_ref_ts = mst_conn.execute(
                'SELECT min(timestamp) FROM mst_scrapes WHERE date >= ?',
                [ref_start]).fetchone()[0]
            if ref_start != ts_to_d(prev_ref_ts):
                # Invalid reference period
                prev_ref_ts = ref_ts
            report_args.append([ts, ref_ts, prev_ref_ts, per_type])

    # Handle case where no potential report remains
    if not report_args:
        if report_set in valid_types:
            msg = f'no {report_set} report produced for scrape {ts}'
            logger.info(msg)
            print(msg)
            return
        else:
            msg = f'no reference scrape available for scrape {ts}: ' \
                  f'only a day report will be produced'
            logger.info(msg)
            print(msg)
            report_args = [(ts, ts, None, 'day')]

    if report_set not in {'min', 'std', 'max'} | valid_types:
        # Report set is not valid; log warning and create no report
        logger.warning(f'illegal report set specified ({report_set}): '
                       f'no report created for scrape {ts}')
    else:
        # Create the reports
        logger.info(f'reporting for scrape {ts} started')
        for args in report_args:
            for report in report_conf.reports:
                report_conf.spec_report(report)
                report_period(*args, full_info=full_info, publish=publish)
        logger.info(f'reporting for scrape {ts} finished')


def ts_to_d(ts: str, delta_days: int = 0) -> dt.date:
    """
    **Convert timestamp to date with potential day shift.**

    Arguments:

        ts: timestamp with format yymmdd-hhmm
        delta_days: number of days to add

    Returns:

        date corresponding to the timestamp

    The returned date is shifted by `delta_days` days.

    Validity of the timestamp is not checked.
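
    For example:

        ts_to_d('240301-0400')      # -> dt.date(2024, 3, 1)
        ts_to_d('240301-0400', -1)  # -> dt.date(2024, 2, 29)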
    """

    y = ts[0:2]
    m = ts[2:4]
    d = ts[4:6]
    year = int(y) if y.isdecimal() else 20
    month = int(m) if m.isdecimal() else 1
    day = int(d) if d.isdecimal() else 1
    return dt.date(2000 + year, month, day) + dt.timedelta(days=delta_days)


def report_period(rep_ts: str,
                  ref_ts: str,
                  pre_ts: str = None,
                  rep_type: str = 'day',
                  full_info: bool = True,
                  publish: bool = False) -> None:
    """
    **Write an Excel report for a given period.**

    Arguments:

        rep_ts: timestamp of the scrape that marks the (exclusive) end of the
            reporting period
        ref_ts: timestamp of the scrape that marks the (inclusive) start of the
            reporting period and the (exclusive) end of the reference period
        pre_ts: timestamp of the scrape that marks the (inclusive) start of the
            reference period
        rep_type: textual characterisation of the reporting period to be
            part of the report name
        full_info: add sheets with pages, links, downloads, redirs and paths
        publish: copy generated report to the configured publish location

    The report will be written as an Excel workbook to the subdirectory
    according to the `REPORT_SUBDIRS` constant within the configured report
    directory. A report with the same name will be overwritten.

    Apart from the sources of the pages, most of the content of the scrape
    at the reporting timestamp `rep_ts` is included in the report.
    Additionally, the report contains page metrics and page feedback for the
    period, and the essential differences for the site as a whole and for
    each new, removed and changed page relative to the scrape at the
    reference timestamp `ref_ts`.

    In case the reference timestamp `ref_ts` equals the reporting timestamp
    `rep_ts`, a 'day' report will be generated, which will not contain
    reference to another scrape.

    The pre-reference timestamp `pre_ts` marks the start of the reference
    period, which is used to compare some period dependent (*dynamic*) key
    figures with the previous period. In case `pre_ts` is `None` (default) or
    equal to the reference timestamp `ref_ts` (leading to a reference period
    of one day), key figures that are dependent on the period length will not
    be referenced to another period.
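
    A sketch of a typical invocation (timestamps hypothetical):

        report_period('240401-0400', '240301-0400', '240201-0400',
                      rep_type='monthly')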
    """

    # Set (inclusive) dates for the reporting period
    rep_end = ts_to_d(rep_ts, -1)
    rep_start = ts_to_d(ref_ts) if ref_ts != rep_ts else rep_end

    # Initiate the report workbook
    rep_name = report_conf.report_name
    report_dir = mst_dir / report_conf.report_dir_name
    xlsx_name = f'{rep_ts} - {rep_type} {rep_name}.xlsx'
    xlsx_path = report_dir / REPORT_SUBDIRS[rep_type] / xlsx_name
    if ref_ts != rep_ts:
        per_str = rep_start.strftime('%b %d, %Y') + ' / '
    else:
        per_str = 'single day: '
    report_info: list[tuple] = [
        ('Report period', per_str + rep_end.strftime('%b %d, %Y')),
        ('Timestamp status scrape', rep_ts),
        ('Timestamp reference scrape', ref_ts if ref_ts != rep_ts else 'None'),
        ('Report creation', time.strftime('%b %d, %Y %H:%M')),
        ('Report version', REPORT_VERSION),
        ('Root URL', ROOT_URL)
    ]
    wb = ReportWorkbook(xlsx_path, report_info)
    keyfigs_sheet(wb, rep_ts, ref_ts, pre_ts)
    data_legend_sheet(wb)

    # Add sheets with reference data
    if ref_ts != rep_ts:
        rem_pages_sheet(wb, rep_ts, ref_ts, report_conf.incl_fb)
        new_pages_sheet(wb, rep_ts, ref_ts, report_conf.incl_fb)
        changed_aspects_sheet(wb, rep_ts, ref_ts)

    # Add sheets with scrape details
    if full_info:
        all_pages_sheet(wb, rep_ts, ref_ts, report_conf.incl_fb)
        if report_conf.incl_fb:
            all_feedback_sheet(wb, rep_ts, ref_ts)
        ed_links_sheet(wb, rep_ts)
        downloads_sheet(wb, rep_ts, ref_ts)
        redirs_aliases_sheet(wb, rep_ts)
        paths_sheet(wb, rep_ts, ref_ts)

    wb.add_buttons()
    if publish:
        wb.close_and_publish()
        logger.info(f'report generated and published: {xlsx_name}')
    else:
        wb.close()
        logger.info(f'report generated: {xlsx_name}')


def keyfigs_sheet(wb: ReportWorkbook,
                  rep_ts: str, ref_ts: str, pre_ts: str) -> None:
    """
    **Add a report sheet with key figures.**

    Arguments:

        wb: workbook in which the new worksheet is created
        rep_ts: timestamp of the scrape that marks the (exclusive) end of the
            reporting period
        ref_ts: timestamp of the scrape that marks the (inclusive) start of the
            reporting period and the (exclusive) end of the reference period
        pre_ts: timestamp of the scrape that marks the (inclusive) start of the
            reference period

    The sheet will contain two types of key figures, which can only be
    distinguished from each other implicitly:

    - figures describing the site *status* at the end of the reporting period
    - figures representing the *dynamics* of the site over the reporting period

    The first (*static*) type is readily available for all timestamps in the
    *his_status_figures* table of the scrape master database. The second
    (*dynamic*) type, consisting of key figures for metrics, feedback and
    editorial text modifications, is calculated by this function.

    In case the reference start timestamp `pre_ts` is `None` or equal to the
    reference end timestamp `ref_ts`, *dynamic* key figures will not be
    referenced.
    """

    # Set (inclusive) dates for the reporting and reference period
    rep_end = ts_to_d(rep_ts, -1)
    rep_start = ts_to_d(ref_ts) if ref_ts != rep_ts else rep_end

    # Gather the key figures to report:
    # - get the dynamic ones
    rep_kfigs = metric_keyfigs(rep_ts, rep_start, rep_end)
    rep_kfigs |= feedback_keyfigs(rep_ts, rep_start, rep_end)
    rep_kfigs |= modpages_keyfigs(rep_ts, ref_ts)
    rep_kfigs |= rempages_keyfigs(rep_ts, ref_ts)
    rep_kfigs |= newpages_keyfigs(rep_ts, ref_ts)
    # - store their names for later use
    rep_dyn_kf_names = list(rep_kfigs)
    # - add the static ones
    rep_kfigs |= status_keyfigs(rep_ts)

    # Prepare the data for the sheet
    kf_details = keyfig_details()
    sheet_values = []
    if ref_ts != rep_ts:
        # Add reference values to compare the report values with
        ref_kfigs = status_keyfigs(ref_ts)
        if pre_ts and pre_ts != ref_ts:
            # Get the reference values for the dynamic key figures
            ref_end = ts_to_d(ref_ts, -1)
            ref_start = ts_to_d(pre_ts)
            ref_kfigs |= metric_keyfigs(ref_ts, ref_start, ref_end)
            ref_kfigs |= feedback_keyfigs(ref_ts, ref_start, ref_end)
            ref_kfigs |= rempages_keyfigs(ref_ts, pre_ts)
            ref_kfigs |= newpages_keyfigs(ref_ts, pre_ts)
            ref_kfigs |= modpages_keyfigs(ref_ts, pre_ts)
        else:
            # The reference period is not known, so add 'x' references for
            # the dynamic key figures.
            for kf_name in rep_dyn_kf_names:
                # noinspection PyTypeChecker
                ref_kfigs[kf_name] = 'x'
        # Calculate delta between reporting and reference values and add key
        # figure details.
        for kf_name in set(rep_kfigs.keys()) | set(ref_kfigs.keys()):
            seq_nr, cluster, description = kf_details.get(kf_name, [0, '', ''])
            rep_val = rep_kfigs.get(kf_name, 0)
            if rep_val == 'x':
                rep_val = ''
                delta = ''
            else:
                ref_val = ref_kfigs.get(kf_name, 0)
                delta = '' if ref_val == 'x' else rep_val - ref_val
            sheet_values.append(
                [seq_nr, cluster, description, kf_name, rep_val, delta]
            )
    else:
        # Add key figure details
        for kf_name in rep_kfigs.keys():
            seq_nr, cluster, description = kf_details.get(kf_name, [0, '', ''])
            rep_val = rep_kfigs.get(kf_name, 0)
            if rep_val == 'x':
                rep_val = ''
            sheet_values.append(
                [seq_nr, cluster, description, kf_name, rep_val]
            )

    # Sort on sequence number and remove this number from the data
    sheet_values.sort()
    for row in sheet_values:
        del row[0]

    # Write the data
    wb.add_datasheet('Key figures', sheet_values,
                     shading='cluster', cluster_del=True)


def status_keyfigs(ts: str) -> dict[str, int]:
    """
    **Get the status figures of a scrape.**

    Arguments:

        ts: timestamp of the scrape [yymmdd-hhmm]

    Returns:

        name, value dictionary of the key figures

    Just returns the name/value pairs from the *his_status_figures* table
    with timestamp `ts`.
    """

    qry = f'''
        SELECT name, value
        FROM his_status_figures
        WHERE timestamp = '{ts}' '''
    return {name: value for name, value in mst_conn.execute(qry).fetchall()}


def metric_keyfigs(
        ts: str, first_date: dt.date, last_date: dt.date) -> dict[str, int]:
    """
    **Get metrics key figures for the period.**

    Arguments:

        ts: timestamp of the scrape [yymmdd-hhmm] that defines the (exclusive)
            end of the period
        first_date: first day of the period
        last_date: last day of the period

    Returns:

        name, value dictionary of the key figures

    The following key figures are returned:

    - views_total: pageviews of all pages
    - pages_visits_low: number of pages with less than 100 visits
    - pages_visits_xlow: number of pages with less than 10 visits
    - pages_visits_no: number of pages with no visits
    - views_lang_`<language>`: pageviews per *language*
    - views_buss_`<business>`: pageviews per *business*
    - views_cat_`<category>`: pageviews per *category*

    In case the `dates` table of the metrics master database does not cover
    all days of the period, the key figures will be returned with a (textual)
    value of 'x'.
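
    A sketch of the returned dictionary (names and values hypothetical):

        {'views_total': 250000, 'pages_visits_low': 310,
         'pages_visits_xlow': 95, 'pages_visits_no': 12,
         'views_lang_en': 50000, 'views_buss_None': 1200,
         'views_cat_info': 180000}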
    """

    per_days = (last_date - first_date).days + 1
    if per_days < 1:
        raise ValueError('invalid period length')
    per_metrics(first_date, last_date)
    qry = f'''
        WITH
            -- All days in period
            days AS (
                SELECT date_id
                FROM dates
                WHERE date >= '{first_date}' AND date <= '{last_date}'
            ),
            -- Check if metrics are available for all days in period 
            per_chk AS (
                SELECT count(*) = {per_days} AS ok
                FROM days
            ),
            -- Last known language of all pages until timestamp
            page_lang AS (
                SELECT DISTINCT
                    page_id,
                    last_value(language) OVER (
                        PARTITION BY page_id
                        ORDER BY timestamp
                        ROWS BETWEEN UNBOUNDED PRECEDING
                            AND UNBOUNDED FOLLOWING
                    ) AS language
                FROM his_pages_info
                WHERE timestamp <= '{ts}'            
            ),
            -- Last known business of all pages until timestamp
            page_buss AS (
                SELECT DISTINCT
                    page_id,
                    last_value(business) OVER (
                        PARTITION BY page_id
                        ORDER BY timestamp
                        ROWS BETWEEN UNBOUNDED PRECEDING
                            AND UNBOUNDED FOLLOWING
                    ) AS business
                FROM his_pages_info
                WHERE timestamp <= '{ts}'
            ),
            -- Last known category of all pages until timestamp
            page_cat AS (
                SELECT DISTINCT
                    page_id,
                    last_value(category) OVER (
                        PARTITION BY page_id
                        ORDER BY timestamp
                        ROWS BETWEEN UNBOUNDED PRECEDING
                            AND UNBOUNDED FOLLOWING
                    ) AS category
                FROM his_pages_info
                WHERE timestamp <= '{ts}'
            ),
            metric_keyfigs (name, value) AS (
                SELECT 'views_total', iif(per_chk.ok, sum(views), 'x')
                FROM per_metrics, per_chk
                    UNION
                SELECT 'pages_visits_low', iif(per_chk.ok, count(*), 'x')
                FROM per_metrics, per_chk
                WHERE visits < 100
                    UNION
                SELECT 'pages_visits_xlow', iif(per_chk.ok, count(*), 'x')
                FROM per_metrics, per_chk
                WHERE visits < 10
                    UNION
                SELECT 'pages_visits_no', iif(per_chk.ok, count(*), 'x')
                FROM per_metrics, per_chk
                WHERE visits = 0
                    UNION
                SELECT
                    'views_lang_' || ifnull(language, 'None'),
                    iif(per_chk.ok, sum(views), 'x')
                FROM per_metrics, per_chk
                LEFT JOIN page_lang USING (page_id)
                GROUP BY language
                    UNION
                SELECT
                    'views_buss_' || ifnull(business, 'None'),
                    iif(per_chk.ok, sum(views), 'x')
                FROM per_metrics, per_chk
                LEFT JOIN page_buss USING (page_id)
                GROUP BY business
                    UNION
                SELECT
                    'views_cat_' || ifnull(category, 'None'),
                    iif(per_chk.ok, sum(views), 'x')
                FROM per_metrics, per_chk
                LEFT JOIN page_cat USING (page_id)
                GROUP BY category
            )
        SELECT name, value
        FROM metric_keyfigs'''
    return {name: value for name, value in mst_conn.execute(qry).fetchall()}


def feedback_keyfigs(
        ts: str, first_date: dt.date, last_date: dt.date) -> dict[str, int]:
    """
    **Get feedback key figures for the period.**

    Arguments:

        ts: timestamp of the scrape [yymmdd-hhmm] that defines the (exclusive)
            end of the period
        first_date: first day of the period
        last_date: last day of the period

    Returns:

        name, value dictionary of the key figures

    The following positive and negative feedback key figures are returned:

    - feedback_pos_total
    - feedback_neg_total
    - feedback_sat-rate_per
    - feedback_sat-rate_ytd
    - feedback_<pos/neg>_lang_`<language>` per *language*
    - feedback_<pos/neg>_buss_`<business>` per *business*
    - feedback_<pos/neg>_cat_`<category>` per *category*

    In case the `dates` table of the metrics master database does not cover
    all days of the period, the key figures will be returned with a (textual)
    value of 'x'.
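
    Both satisfaction rates are calculated as pos / (pos + neg); e.g. with
    300 positive and 100 negative feedbacks the rate is 300 / 400 = 0.750.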
    """
    per_days = (last_date - first_date).days + 1
    if per_days < 1:
        raise ValueError('invalid period length')
    per_pages(first_date, last_date)
    qry = f'''
        WITH
            -- All days in period
            per_days AS (
                SELECT date_id
                FROM dates
                WHERE date >= '{first_date}' AND date <= '{last_date}'
            ),
            -- All days year-to-date
            ytd_days AS (
                SELECT date_id
                FROM dates
                WHERE date >= substr('{last_date}', 1, 4) || '-01-01'
                    AND date <= '{last_date}'
            ),
            -- Check if metrics are available for all days in period 
            per_chk AS (
                SELECT count(*) =
                        julianday('{last_date}') 
                        - julianday('{first_date}') + 1
                    AS ok
                FROM per_days
            ),
            -- Check if metrics are available for all days year-to-date 
            ytd_chk AS (
                SELECT count(*) = 
                        julianday('{last_date}') 
                        - julianday(substr('{last_date}', 1, 4) || '-01-01') + 1
                    AS ok
                FROM ytd_days
            ),
            -- Period totals per page
            per_feedback AS (
                SELECT
                    page_id,
                    sum(pos_cnt) AS pos_cnt,
                    sum(neg_cnt) AS neg_cnt,
                    group_concat(pos_txt, char(10)) AS pos_txt,
                    group_concat(neg_txt, char(10)) AS neg_txt
                FROM feedback
                WHERE date_id IN per_days
                    -- Exclude pages that were not detected to be alive in the
                    -- period (which happened with earlier scrapes):
                    AND page_id IN per_pages
                GROUP BY page_id
            ),
            -- Year-to-date totals
            ytd_fb_tot AS (
                SELECT
                    sum(pos_cnt) AS pos_cnt,
                    sum(neg_cnt) AS neg_cnt
                FROM feedback
                WHERE date_id IN ytd_days
            ),
            -- Last known language of all pages until timestamp
            page_lang AS (
                SELECT DISTINCT
                    page_id,
                    last_value(language) OVER (
                        PARTITION BY page_id
                        ORDER BY timestamp
                        ROWS BETWEEN UNBOUNDED PRECEDING
                            AND UNBOUNDED FOLLOWING
                    ) AS language
                FROM his_pages_info
                WHERE timestamp <= '{ts}'
            ),
            -- Last known business of all pages until timestamp
            page_buss AS (
                SELECT DISTINCT
                    page_id,
                    last_value(business) OVER (
                        PARTITION BY page_id
                        ORDER BY timestamp
                        ROWS BETWEEN UNBOUNDED PRECEDING
                            AND UNBOUNDED FOLLOWING
                    ) AS business
                FROM his_pages_info
                WHERE timestamp <= '{ts}'
            ),
            -- Last known category of all pages until timestamp
            page_cat AS (
                SELECT DISTINCT
                    page_id,
                    last_value(category) OVER (
                        PARTITION BY page_id
                        ORDER BY timestamp
                        ROWS BETWEEN UNBOUNDED PRECEDING
                            AND UNBOUNDED FOLLOWING
                    ) AS category
                FROM his_pages_info
                WHERE timestamp <= '{ts}'
            ),
            feedback_keyfigs (name, value) AS (
                SELECT
                    'feedback_pos_total',
                    iif(per_chk.ok, sum(pos_cnt), 'x')
                FROM per_feedback, per_chk
                    UNION
                SELECT
                    'feedback_pos_lang_' || ifnull(language, 'None'),
                    iif(per_chk.ok, sum(pos_cnt), 'x')
                FROM per_feedback, per_chk
                LEFT JOIN page_lang USING (page_id)
                GROUP BY language
                    UNION
                SELECT
                    'feedback_pos_buss_' || ifnull(business, 'None'),
                    iif(per_chk.ok, sum(pos_cnt), 'x')
                FROM per_feedback, per_chk
                LEFT JOIN page_buss USING (page_id)
                GROUP BY business
                    UNION
                SELECT
                    'feedback_pos_cat_' || ifnull(category, 'None'),
                    iif(per_chk.ok, sum(pos_cnt), 'x')
                FROM per_feedback, per_chk
                LEFT JOIN page_cat USING (page_id)
                GROUP BY category
                    UNION
                SELECT
                    'feedback_neg_total',
                    iif(per_chk.ok, sum(neg_cnt), 'x')
                FROM per_feedback, per_chk
                    UNION
                SELECT
                    'feedback_neg_lang_' || ifnull(language, 'None'),
                    iif(per_chk.ok, sum(neg_cnt), 'x')
                FROM per_feedback, per_chk
                LEFT JOIN page_lang USING (page_id)
                GROUP BY language
                    UNION
                SELECT
                    'feedback_neg_buss_' || ifnull(business, 'None'),
                    iif(per_chk.ok, sum(neg_cnt), 'x')
                FROM per_feedback, per_chk
                LEFT JOIN page_buss USING (page_id)
                GROUP BY business
                    UNION
                SELECT
                    'feedback_neg_cat_' || ifnull(category, 'None'),
                    iif(per_chk.ok, sum(neg_cnt), 'x')
                FROM per_feedback, per_chk
                LEFT JOIN page_cat USING (page_id)
                GROUP BY category
                    UNION
                SELECT
                    'feedback_sat-rate_per',
                    iif(per_chk.ok,
                        CAST(sum(pos_cnt) AS REAL) 
                            / (sum(pos_cnt) + sum(neg_cnt)), 
                        'x')
                FROM per_feedback, per_chk
                    UNION
                SELECT
                    'feedback_sat-rate_ytd',
                    iif(ytd_chk.ok,
                        CAST(pos_cnt AS REAL) / (pos_cnt + neg_cnt), 
                        'x')
                FROM ytd_fb_tot, ytd_chk
            )
        SELECT name, value
        FROM feedback_keyfigs
        '''
    return {name: value for name, value in mst_conn.execute(qry).fetchall()}


def rempages_keyfigs(rep_ts: str, ref_ts: str) -> dict[str, int]:
    """
    **Get key figures for removed pages since reference scrape.**

    Arguments:

        rep_ts: timestamp of the reporting scrape [yymmdd-hhmm]
        ref_ts: timestamp of the reference scrape [yymmdd-hhmm]

    Returns:

        name, value dictionary of the key figures

    The following key figures are returned:

    - pages_rem: total number of removed pages
    - pages_lang_`<language>`_rem: number of removed pages per *language*
    - pages_buss_`<business>`_rem: number of removed pages per *business*
    - pages_cat_`<category>`_rem: number of removed pages per *category*

    When both timestamps (`rep_ts` and `ref_ts`) are the same, the key
    figures will be returned with a (textual) value of 'x'.
    """

    # Handle special cases
    if not ref_ts or not rep_ts or ref_ts > rep_ts:
        raise ValueError('invalid period')
    if ref_ts == rep_ts:
        qry = f'''
            WITH
                -- Last known different combinations of language, business and
                -- category until reporting timestamp.
                lan_bus_cat_combis AS (
                    SELECT DISTINCT
                        last_value(language) OVER win AS language,
                        last_value(business) OVER win AS business,
                        last_value(category) OVER win AS category
                    FROM his_pages_info
                    WHERE timestamp <= '{rep_ts}'
                    WINDOW win AS (
                        PARTITION BY page_id
                        ORDER BY timestamp
                        ROWS BETWEEN UNBOUNDED PRECEDING
                            AND UNBOUNDED FOLLOWING
                    )
                ),
                rempages_keyfigs (name, value) AS (
                    SELECT 'pages_rem', 'x'
                        UNION
                    SELECT
                        'pages_lang_' || ifnull(language, 'None') || '_rem',
                        'x'
                    FROM lan_bus_cat_combis
                    GROUP BY language
                        UNION
                    SELECT
                        'pages_buss_' || ifnull(business, 'None') || '_rem',
                        'x'
                    FROM lan_bus_cat_combis
                    GROUP BY business
                        UNION
                    SELECT
                        'pages_cat_' || ifnull(category, 'None') || '_rem',
                        'x'
                    FROM lan_bus_cat_combis
                    GROUP BY category
                )
            SELECT name, value
            FROM rempages_keyfigs'''
        logger.info('key figures for removed pages requested for '
                    'period with zero length')
        return {name: value for name, value in mst_conn.execute(qry).fetchall()}

    # Return the key figures for the regular situation
    qry = f'''
        WITH
            -- All timestamps within the reporting period
            period AS (
                SELECT timestamp
                FROM mst_scrapes
                WHERE timestamp > '{ref_ts}'
                    AND timestamp <= '{rep_ts}'
            ),
            -- All removed pages in period
            removed_pages AS (
                SELECT page_id
                FROM his_pages_life
                WHERE timestamp in period
                    AND NOT alive
            ),
            -- Language, business and category aspects of the removed pages
            lan_bus_cat_aspects AS (
                SELECT DISTINCT
                    page_id,
                    last_value(language) OVER win AS language,
                    last_value(business) OVER win AS business,
                    last_value(category) OVER win AS category
                FROM his_pages_info
                WHERE timestamp <= '{rep_ts}' AND page_id IN removed_pages
                WINDOW win AS (
                    PARTITION BY page_id
                    ORDER BY timestamp
                    ROWS BETWEEN UNBOUNDED PRECEDING
                        AND UNBOUNDED FOLLOWING
                )
            ),
            rempages_keyfigs (name, value) AS (
                SELECT 'pages_rem', count(*)
                FROM removed_pages
                    UNION
                SELECT
                    'pages_lang_' || ifnull(language, 'None') || '_rem',
                    count(*)
                FROM lan_bus_cat_aspects
                GROUP BY language
                    UNION
                SELECT
                    'pages_buss_' || ifnull(business, 'None') || '_rem',
                    count(*)
                FROM lan_bus_cat_aspects
                GROUP BY business
                    UNION
                SELECT
                    'pages_cat_' || ifnull(category, 'None') || '_rem',
                    count(*)
                FROM lan_bus_cat_aspects
                GROUP BY category
            )
        SELECT name, value
        FROM rempages_keyfigs'''
    return {name: value for name, value in mst_conn.execute(qry).fetchall()}


def newpages_keyfigs(rep_ts: str, ref_ts: str) -> dict[str, int]:
    """
    **Get key figures for new pages since reference scrape.**

    Arguments:

        rep_ts: timestamp of the reporting scrape [yymmdd-hhmm]
        ref_ts: timestamp of the reference scrape [yymmdd-hhmm]

    Returns:

        name, value dictionary of the key figures

    The following key figures are returned:

    - pages_new: total number of new pages
    - pages_lang_`<language>`_new: number of new pages per *language*
    - pages_buss_`<business>`_new: number of new pages per *business*
    - pages_cat_`<category>`_new: number of new pages per *category*

    When both timestamps (`rep_ts` and `ref_ts`) are the same, the key
    figures will be returned with a (textual) value of 'x'.
    """

    # Handle special cases
    if not ref_ts or not rep_ts or ref_ts > rep_ts:
        raise ValueError('invalid period')
    if ref_ts == rep_ts:
        qry = f'''
            WITH
                -- Last known different combinations of language, business and
                -- category until reporting timestamp.
                lan_bus_cat_combis AS (
                    SELECT DISTINCT
                        last_value(language) OVER win AS language,
                        last_value(business) OVER win AS business,
                        last_value(category) OVER win AS category
                    FROM his_pages_info
                    WHERE timestamp <= '{rep_ts}'
                    WINDOW win AS (
                        PARTITION BY page_id
                        ORDER BY timestamp
                        ROWS BETWEEN UNBOUNDED PRECEDING
                            AND UNBOUNDED FOLLOWING
                    )
                ),
                newpages_keyfigs (name, value) AS (
                    SELECT 'pages_new', 'x'
                        UNION
                    SELECT
                        'pages_lang_' || ifnull(language, 'None') || '_new',
                        'x'
                    FROM lan_bus_cat_combis
                    GROUP BY language
                        UNION
                    SELECT
                        'pages_buss_' || ifnull(business, 'None') || '_new',
                        'x'
                    FROM lan_bus_cat_combis
                    GROUP BY business
                        UNION
                    SELECT
                        'pages_cat_' || ifnull(category, 'None') || '_new',
                        'x'
                    FROM lan_bus_cat_combis
                    GROUP BY category
                )
            SELECT name, value
            FROM newpages_keyfigs'''
        logger.info(
            'key figures for new pages requested for period with zero length')
        return {name: value for name, value in mst_conn.execute(qry).fetchall()}

    # Store the key figures for the regular situation
    qry = f'''
        WITH
            -- All timestamps within the reporting period
            period AS (
                SELECT timestamp
                FROM mst_scrapes
                WHERE timestamp > '{ref_ts}'
                    AND timestamp <= '{rep_ts}'
            ),
            -- All new pages in period
            new_pages AS (
                SELECT page_id
                FROM his_pages_life
                WHERE timestamp IN period
                    AND alive
            ),
            -- Language, business and category aspects of the new pages
            lan_bus_cat_aspects AS (
                SELECT DISTINCT
                    page_id,
                    last_value(language) OVER win AS language,
                    last_value(business) OVER win AS business,
                    last_value(category) OVER win AS category
                FROM his_pages_info
                WHERE timestamp <= '{rep_ts}' AND page_id IN new_pages
                WINDOW win AS (
                    PARTITION BY page_id
                    ORDER BY timestamp
                    ROWS BETWEEN UNBOUNDED PRECEDING
                        AND UNBOUNDED FOLLOWING
                )
            ),
            newpages_keyfigs (name, value) AS (
                SELECT 'pages_new', count(*)
                FROM new_pages
                    UNION
                SELECT
                    'pages_lang_' || ifnull(language, 'None') || '_new',
                    count(*)
                FROM lan_bus_cat_aspects
                GROUP BY language
                    UNION
                SELECT
                    'pages_buss_' || ifnull(business, 'None') || '_new',
                    count(*)
                FROM lan_bus_cat_aspects
                GROUP BY business
                    UNION
                SELECT
                    'pages_cat_' || ifnull(category, 'None') || '_new',
                    count(*)
                FROM lan_bus_cat_aspects
                GROUP BY category
            )
        SELECT name, value
        FROM newpages_keyfigs'''
    return {name: value for name, value in mst_conn.execute(qry).fetchall()}


def modpages_keyfigs(rep_ts: str, ref_ts: str,
                     min_modification: float = 0.005
                     ) -> dict[str, Union[int, str]]:
    """
    **Get key figures for page modifications since reference scrape.**

    Arguments:

        rep_ts: timestamp of the reporting scrape [yymmdd-hhmm]
        ref_ts: timestamp of the reference scrape [yymmdd-hhmm]
        min_modification: modification factor threshold below which
            modifications will not be counted

    Returns:

        name, value dictionary of the key figures

    For all pages alive at some moment within the period defined by
    `ref_ts` and `rep_ts`, the following key figures are returned,
    distinguishing pages whose editorial content was modified above the
    `min_modification` threshold from those whose content was not:

    - pages_mod: total number of modified pages
    - pages_lang_`<language>`_mod: number of modified pages per *language*
    - pages_buss_`<business>`_mod: number of modified pages per *business*
    - pages_cat_`<category>`_mod: number of modified pages per *category*
    - pages_unm: total number of unmodified pages
    - pages_lang_`<language>`_unm: number of unmodified pages per *language*
    - pages_buss_`<business>`_unm: number of unmodified pages per *business*
    - pages_cat_`<category>`_unm: number of unmodified pages per *category*

    When both timestamps (`rep_ts` and `ref_ts`) are the same, the key
    figures will be returned with a (textual) value of 'x'.
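
    For illustration with hypothetical counts: if 500 pages were alive at
    some moment in the period and 40 of them were modified above the
    threshold, the result would contain (among the per-aspect figures):

        {'pages_mod': 40, 'pages_unm': 460}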
    """

    # Handle the special cases
    if not ref_ts or not rep_ts or ref_ts > rep_ts:
        raise ValueError('invalid period')
    if ref_ts == rep_ts:
        qry = f'''
            WITH
                -- Last known different combinations of language, business and
                -- category until reporting timestamp.
                lan_bus_cat_combis AS (
                    SELECT DISTINCT
                        last_value(language) OVER win AS language,
                        last_value(business) OVER win AS business,
                        last_value(category) OVER win AS category
                    FROM his_pages_info
                    WHERE timestamp <= '{rep_ts}'
                    WINDOW win AS (
                        PARTITION BY page_id
                        ORDER BY timestamp
                        ROWS BETWEEN UNBOUNDED PRECEDING
                            AND UNBOUNDED FOLLOWING
                    )
                ),
                mod_keyfigs (name, value) AS (
                    SELECT 'pages_mod', 'x'
                        UNION
                    SELECT
                        'pages_lang_' || ifnull(language, 'None') || '_mod',
                        'x'
                    FROM lan_bus_cat_combis
                    GROUP BY language
                        UNION
                    SELECT
                        'pages_buss_' || ifnull(business, 'None') || '_mod',
                        'x'
                    FROM lan_bus_cat_combis
                    GROUP BY business
                        UNION
                    SELECT
                        'pages_cat_' || ifnull(category, 'None') || '_mod',
                        'x'
                    FROM lan_bus_cat_combis
                    GROUP BY category
                )
            SELECT name, value
            FROM mod_keyfigs'''
        logger.info('key figures for editorial text modifications requested '
                    'for period with zero length')
        mod_keyfigs = {name: value for name, value
                       in mst_conn.execute(qry).fetchall()}
        unm_keyfigs = {}
        for mod_name in mod_keyfigs:
            unm_name = mod_name[:-3] + 'unm'
            unm_keyfigs[unm_name] = 'x'
        return mod_keyfigs | unm_keyfigs

    # Create table with modifications of editorial texts within period
    mst_conn.execute(f'''
        CREATE TEMPORARY TABLE edtext_mods AS
        WITH
            -- All timestamps within the reporting period
            period AS (
                SELECT timestamp
                FROM mst_scrapes
                WHERE timestamp > '{ref_ts}'
                    AND timestamp <= '{rep_ts}'
            ),
            -- All stored ed_text values per page, together with their
            -- previous value.
            curr_and_prev_edtext AS (
                SELECT page_id, timestamp, ed_text,
                    first_value(timestamp) OVER win1 AS prev_timestamp,
                    first_value(ed_text) OVER win1 AS prev_ed_text
                FROM his_pages_info
                WINDOW win1 AS (
                    PARTITION BY page_id
                    ORDER BY timestamp
                    ROWS 1 PRECEDING
                )
            ),
            -- Language, business and category aspects of all pages
            lan_bus_cat_aspects AS (
                SELECT DISTINCT
                    page_id,
                    last_value(language) OVER win2 AS language,
                    last_value(business) OVER win2 AS business,
                    last_value(category) OVER win2 AS category
                FROM his_pages_info
                WHERE timestamp <= '{rep_ts}'
                WINDOW win2 AS (
                    PARTITION BY page_id
                    ORDER BY timestamp
                    ROWS BETWEEN UNBOUNDED PRECEDING
                        AND UNBOUNDED FOLLOWING
                )
            )
        SELECT DISTINCT
            page_id, language, business, category,
            first_value(prev_ed_text) OVER win3 AS begin_edtext,
            last_value(ed_text) OVER win3 AS end_edtext,
            0 AS mod_fact
        FROM curr_and_prev_edtext
        LEFT JOIN lan_bus_cat_aspects USING (page_id)
        WHERE ed_text != prev_ed_text AND timestamp IN period
        WINDOW win3 AS (
            PARTITION BY page_id
            ORDER BY timestamp
            ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING
        )
        ''')
    # Calculate and update the modification factors
    qry = '''
        SELECT page_id, begin_edtext, end_edtext
        FROM edtext_mods
        '''
    for rec in mst_conn.execute(qry).fetchall():
        mst_conn.execute(
            'UPDATE edtext_mods SET mod_fact = ? WHERE page_id = ?',
            (mod_factor(rec[1], rec[2]), rec[0]))
    # Count the page modification key figures
    qry = f'''
        WITH
            edtext_keyfigs (name, value) AS (
                SELECT 'pages_mod', count(*)
                FROM edtext_mods
                WHERE mod_fact >= {min_modification}
                    UNION
                SELECT
                    'pages_lang_' || ifnull(language, 'None') || '_mod',
                    count(*)
                FROM edtext_mods
                WHERE mod_fact >= {min_modification}
                GROUP BY language
                    UNION
                SELECT
                    'pages_buss_' || ifnull(business, 'None') || '_mod',
                    count(*)
                FROM edtext_mods
                WHERE mod_fact >= {min_modification}
                GROUP BY business
                    UNION
                SELECT
                    'pages_cat_' || ifnull(category, 'None') || '_mod',
                    count(*)
                FROM edtext_mods
                WHERE mod_fact >= {min_modification}
                GROUP BY category
            )
        SELECT name, value
        FROM edtext_keyfigs
        '''
    mod_keyfigs = {name: value for name, value
                   in mst_conn.execute(qry).fetchall()}
    mst_conn.execute('DROP TABLE edtext_mods')
    # Get key figures for living pages in period as intermediate result
    qry = f'''
        WITH
            -- All timestamps within the reporting period
            period AS (
                SELECT timestamp
                FROM mst_scrapes
                WHERE timestamp > '{ref_ts}'
                    AND timestamp <= '{rep_ts}'
            ),
            -- Page alive values at start of period
            life_at_start AS (
                SELECT DISTINCT
                    page_id,
                    last_value(alive) OVER (
                        PARTITION BY page_id
                        ORDER BY timestamp
                        ROWS BETWEEN UNBOUNDED PRECEDING
                            AND UNBOUNDED FOLLOWING
                    ) AS alive
                FROM his_pages_life
                WHERE timestamp <= '{ref_ts}'
            ),
            -- Pages alive at some moment in period
            alive_in_period AS (
                -- Pages alive at start of period
                SELECT page_id
                FROM life_at_start
                WHERE alive
                    UNION
                -- All new pages in period
                SELECT page_id
                FROM his_pages_life
                WHERE timestamp IN period AND alive
            ),
            -- Language, business and category aspects of pages alive in period
            lan_bus_cat_aspects AS (
                SELECT DISTINCT
                    page_id,
                    last_value(language) OVER win AS language,
                    last_value(business) OVER win AS business,
                    last_value(category) OVER win AS category
                FROM his_pages_info
                WHERE timestamp <= '{rep_ts}' AND page_id IN alive_in_period
                WINDOW win AS (
                    PARTITION BY page_id
                    ORDER BY timestamp
                    ROWS BETWEEN UNBOUNDED PRECEDING
                        AND UNBOUNDED FOLLOWING
                )
            ),
            -- Key figures for pages alive in period (not used in report)
            living_keyfigs (name, value) AS (
                SELECT 'pages_liv', count(*)
                FROM alive_in_period
                    UNION
                SELECT
                    'pages_lang_' || ifnull(language, 'None') || '_liv',
                    count(*)
                FROM lan_bus_cat_aspects
                GROUP BY language
                    UNION
                SELECT
                    'pages_buss_' || ifnull(business, 'None') || '_liv',
                    count(*)
                FROM lan_bus_cat_aspects
                GROUP BY business
                    UNION
                SELECT
                    'pages_cat_' || ifnull(category, 'None') || '_liv',
                    count(*)
                FROM lan_bus_cat_aspects
                GROUP BY category
            )
        SELECT name, value
        FROM living_keyfigs
        '''
    liv_keyfigs = {name: value for name, value
                   in mst_conn.execute(qry).fetchall()}
    unm_keyfigs = {}
    for liv_name, liv_val in liv_keyfigs.items():
        mod_name = liv_name[:-3] + 'mod'
        mod_val = mod_keyfigs.get(mod_name, 0)
        unm_name = liv_name[:-3] + 'unm'
        unm_val = liv_val - mod_val
        unm_keyfigs[unm_name] = unm_val
    # Return the (un)modified key figures, but not the living
    return mod_keyfigs | unm_keyfigs


def keyfig_details(
        language: str = 'english') -> dict[str, list[Union[int, str]]]:
    """
    **Get details for all potential key figures.**

    Arguments:

        language: language of the description to return; 'dutch' will return
            Dutch descriptions, any other value will return them in English

    Returns:

        seq_nr, cluster, description per key figure

    The details are read from the configured xlsx-file `kf_details_name` which
    has five fields per line:

    - sequence number for ordering
    - key figure name
    - cluster name for visual grouping
    - Dutch description
    - English description
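
    For example, a hypothetical row

        (10, 'pages_new', 'Pages', "Aantal nieuwe pagina's",
         'Number of new pages')

    yields, with the default language 'english', the entry:

        kf_details['pages_new'] == [10, 'Pages', 'Number of new pages']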
    """

    file = Path(report_conf.kf_details_name)
    ws = openpyxl.load_workbook(file).active
    kf_details = {}
    for row_nr, (seq_nr, name, cluster, dutch, english) \
            in enumerate(ws.values, start=1):
        if row_nr == 1:
            # Skip row with field names
            continue
        kf_details[name] = [
            int(seq_nr), cluster, dutch if language == 'dutch' else english
        ]
    return kf_details


def data_legend_sheet(wb: ReportWorkbook) -> None:
    """
    **Add report sheet with column legend.**

    Arguments:

        wb: workbook in which the new worksheet is created

    The sheet gives descriptions of those columns in the report sheets that
    may not be self-explanatory. These descriptions are read from the
    xlsx-file that is configured via the `data_legend_name` field in the
    `[REPORT]` section of the configuration file (see documentation of
    `bd_www.Config`).
    """

    dl_file = Path(report_conf.data_legend_name)
    ws = openpyxl.load_workbook(dl_file).active
    legend_details = list(ws.values)
    del legend_details[0]  # Remove row with field names
    wb.add_datasheet('Data legend', legend_details,
                     shading='cluster', cluster_col=1, cluster_del=True)


def rem_pages_sheet(wb: ReportWorkbook,
                    rep_ts: str, ref_ts: str, incl_txt_fb: bool = True) -> None:
    """
    **Add report sheet with data of removed pages.**

    Arguments:

        wb: workbook in which the new worksheet is created
        rep_ts: timestamp of the scrape that marks the (exclusive) end of the
            reporting period
        ref_ts: timestamp of the scrape that marks the (inclusive) start of the
            reporting period and the (exclusive) end of the reference period
        incl_txt_fb: if False, no textual feedback will be included

    This sheet lists the details of all pages that were removed since the
    reference scrape.
    """

    # Set (inclusive) dates for the reporting period
    rep_end = ts_to_d(rep_ts, -1)
    rep_start = ts_to_d(ref_ts) if ref_ts else rep_end

    # Create temporary views
    page_themes()
    per_metrics(rep_start, rep_end)
    per_feedback(rep_start, rep_end)

    qry = f'''
        WITH
            -- Pages removed in the period with the timestamp of removal
            page_removals (page_id, rem_ts) AS (
                SELECT page_id, timestamp
                FROM his_pages_life
                WHERE alive = 0
                    AND timestamp > '{ref_ts}'
                    AND timestamp <= '{rep_ts}'
            ),
            -- Info timestamp of the removed pages
            rem_pages_info_ts (page_id, rem_ts, info_ts) AS (
                SELECT DISTINCT
                    page_id, rem_ts,
                    last_value(timestamp) OVER (
                        PARTITION BY page_id, rem_ts
                        ORDER BY timestamp
                        ROWS BETWEEN UNBOUNDED PRECEDING
                            AND UNBOUNDED FOLLOWING
                    )
                FROM page_removals
                LEFT JOIN his_pages_info USING (page_id)
                WHERE timestamp <= rem_ts
            ),
            -- All aspects of the removed pages
            removed_pages_aspects AS (
                SELECT rem_ts, i.*
                FROM rem_pages_info_ts AS r
                LEFT JOIN his_pages_info AS i
                    ON r.page_id = i.page_id AND r.info_ts = i.timestamp
            )
        SELECT
            rem_ts, page_id, path, 
            title, description, first_h1, language, modified, pagetype,
            classes, theme, business, category,
            ed_text, aut_text,
            visits, views, entries, bounces, bounce_rate, exits, exit_rate,
            organic_entries, organic_entry_rate, call_visits, call_rate,
            neg_fb_cnt, pos_fb_cnt,
            ifnull(CAST (neg_fb_cnt + pos_fb_cnt AS REAL) / visits,0)
                AS fb_rate,
            neg_fb_pct, pos_fb_pct, neg_txts, pos_txts
        FROM removed_pages_aspects
        LEFT JOIN mst_paths USING (page_id)
        LEFT JOIN temp.themes USING (page_id)
        LEFT JOIN temp.per_metrics USING (page_id)
        LEFT JOIN temp.per_feedback USING (page_id)
        ORDER BY page_id, timestamp
        '''
    data = []
    # Reformat or remove textual feedback in the query results
    for cells in mst_conn.execute(qry).fetchall():
        cells = list(cells)
        # Reformat or delete textual feedback
        if incl_txt_fb:
            if cells[31]:
                cells[31] = cell_feedbacks(prep_feedbacks(cells[31]))
            if cells[32]:
                cells[32] = cell_feedbacks(prep_feedbacks(cells[32]))
        else:
            del cells[31:33]
        data.append(cells)
    wb.add_datasheet('Removed pages', data, incl_txt_fb)


def page_themes() -> None:
    """
    **Create view with theme of each page.**

    A new temporary view `themes` is created with the fields `page_id` and
    `theme`. The value of `theme` is deduced from the page path according to
    the following patterns (in which `<lc>` is a two-letter language code):

    - /`<lc>`/<theme>
    - /bldcontent`<lc>`/berichten/<theme>
    - /bldcontent`<lc>`/themaoverstijgend/<theme>
    - /bldcontent`<lc>`/belastingdienst/<segment>/<theme>
    - /bldcontent`<lc>`/standaard_functies/<segment>/<theme>
    - /bldcontent`<lc>`/events/<segment>/<theme>
    - /bldcontent`<lc>`/niet_in_enig_menu/<segment>/<theme>
    - /bldcontent`<lc>`/campagnes/landingspaginas/<segment>/<theme>
    - /bldsysteem/... : theme 'systeem'

    Since the view does not depend on scrape or period, it will not be
    replaced if it already exists.

    The view works upon the scrapes database, which is connected via the
    global master connection `mst_conn`.
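
    Some hypothetical paths with the themes they would yield:

        /nl/douane                                   -> douane
        /bldcontentnl/berichten/toeslagen            -> toeslagen
        /bldcontentnl/campagnes/landingspaginas/x/y  -> y
        /bldsysteem/common/main.css                  -> systeem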
    """

    mst_conn.execute('''
        CREATE TEMPORARY VIEW IF NOT EXISTS themes AS
        WITH
            -- CTE to extract the first five segments from the page path
            one_segment AS (
                SELECT
                    page_id,
                    substr(path, 2, instr(substring(path, 2), '/')-1)
                        AS segment1,
                    substr(substring(path, 2),
                           instr(substring(path, 2), '/')+1) || '/' AS rest
                FROM mst_paths
            ),
            two_segments AS (
                SELECT
                    page_id, segment1,
                    substr(rest, 1, instr(rest, '/')-1) AS segment2,
                    substr(rest, instr(rest, '/')+1) AS rest
                FROM one_segment
            ),
            three_segments AS (
                SELECT
                    page_id, segment1, segment2,
                    substr(rest, 1, instr(rest, '/')-1) AS segment3,
                    substr(rest, instr(rest, '/')+1) AS rest
                FROM two_segments
            ),
            four_segments AS (
                SELECT
                    page_id, segment1, segment2, segment3,
                    substr(rest, 1, instr(rest, '/')-1) AS segment4,
                    substr(rest, instr(rest, '/')+1) AS rest
                FROM three_segments
            ),
            five_segments AS (
                SELECT
                    page_id, segment1, segment2, segment3, segment4,
                    substr(rest, 1, instr(rest, '/')-1) AS segment5
                FROM four_segments
            )
        -- Theme for urls like /<lc>/<theme>
        SELECT
            page_id, segment2 AS theme
        FROM five_segments
        WHERE segment1 LIKE '__'
        UNION
        -- Theme for urls like:
        --    /bldcontent<lc>/berichten/<theme>
        --    /bldcontent<lc>/themaoverstijgend/<theme>
        SELECT
            page_id, segment3 AS theme
        FROM five_segments
        WHERE segment1 LIKE 'bldcontent__'
            AND segment2 IN ('berichten', 'themaoverstijgend')
        UNION
        -- theme for urls like:
        --    /bldcontent<lc>/belastingdienst/<segment>/<theme>
        --    /bldcontent<lc>/standaard_functies/<segment>/<theme>
        --    /bldcontent<lc>/events/<segment>/<theme>
        --    /bldcontent<lc>/niet_in_enig_menu/<segment>/<theme>
        SELECT
            page_id, segment4 AS theme
        FROM five_segments
        WHERE segment1 LIKE 'bldcontent__'
            AND segment2 IN ('belastingdienst', 'standaard_functies',
                             'events', 'niet_in_enig_menu')
        UNION
        -- theme for urls like:
        --    /bldcontent<lc>/campagnes/landingspaginas/<segment>/<theme>
        SELECT
            page_id, segment5 AS theme
        FROM five_segments
        WHERE segment1 LIKE 'bldcontent__'
            AND segment2 = 'campagnes'
            AND segment3 = 'landingspaginas'
        UNION
        -- theme 'systeem' for urls like:
        --    /bldsysteem 
        SELECT
            page_id, 'systeem' AS theme
        FROM four_segments
        WHERE segment1 = 'bldsysteem'
        ''')


def per_metrics(first_date: dt.date, last_date: dt.date) -> None:
    """
    **Create temporary view with metrics for all pages alive in period.**

    Arguments:

        first_date: first day of the period
        last_date: last day of the period

    A new temporary view `per_metrics` is created with the following values
    for all pages alive at some moment during the period:

    - `page_id`: the id of the page
    - `visits`: number of visits that included this page [INTEGER]
    - `views`: number of times this page has been requested [INTEGER]
    - `entries`: number of visits that started on this page [INTEGER]
    - `bounces`: number of visits that only viewed this page [INTEGER]
    - `exits`: number of visits that ended on this page [INTEGER]
    - `bounce_rate`: fraction of entries on this page that bounced
      [REAL: `bounces` / `entries`]
    - `exit_rate`: fraction of visits that exited on this page
      [REAL: `exits` / `visits`]
    - `organic_entries`: number of entries that originated from an external
      search engine [INTEGER]
    - `organic_entry_rate`: fraction of visits that started on the page and
      originated from an external search engine
      [REAL: `organic_entries` / `visits`]
    - `call_visits`: number of visits that included this page as well as a page
      with calling information [INTEGER]
    - `call_rate`: fraction of visits that included a page with calling
      information [REAL: `call_visits` / `visits`]

    An existing view with the same name will be overwritten.

    The view works upon the metrics database, which is connected via the
    global master connection `mst_conn`.
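
    A worked illustration with hypothetical counts: a page with 1000 visits,
    200 entries, 50 bounces and 300 exits would get

        bounce_rate = round(50 / 200, 3)     # 0.25
        exit_rate = round(300 / 1000, 3)     # 0.3

    where each rate defaults to 0 whenever its denominator is 0.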
    """

    per_pages(first_date, last_date)
    mst_conn.execute('DROP VIEW IF EXISTS per_metrics')
    mst_conn.execute(f'''
        CREATE TEMPORARY VIEW per_metrics AS
        WITH
            -- Dates in period
            per_days AS (
                SELECT date_id
                FROM dates
                WHERE date >= '{first_date}' AND date <= '{last_date}'
            ),
            -- Available metrics
            per_basic_metrics AS (
                SELECT DISTINCT
                    page_id,
                    sum(nb_visits) AS visits,
                    sum(nb_hits) AS views,
                    sum(entry_nb_visits) AS entries,
                    sum(entry_bounce_count) AS bounces,
                    sum(exit_nb_visits) AS exits,
                    sum(organic_entries) AS organic_entries,
                    sum(call_visits) AS call_visits
                FROM daily
                WHERE date_id IN per_days
                GROUP BY page_id
            ),
            -- Pages alive in period with available basic metrics
            per_pages_metrics AS (
                SELECT
                    page_id,
                    ifnull(visits,0) AS visits,
                    ifnull(views,0) AS views,
                    ifnull(entries,0) AS entries,
                    ifnull(bounces,0) AS bounces,
                    ifnull(exits,0) AS exits,
                    ifnull(organic_entries,0) AS organic_entries,
                    ifnull(call_visits,0) AS call_visits
                FROM per_pages
                LEFT JOIN per_basic_metrics USING (page_id)
            )
        -- Pages alive in period with basic and calculated metrics
        SELECT
            *,
            ifnull(round(CAST(bounces AS REAL) / entries, 3), 0) AS bounce_rate,
            ifnull(round(CAST(exits AS REAL) / visits, 3), 0) AS exit_rate,
            ifnull(round(CAST(organic_entries AS REAL) / visits, 3), 0)
                AS organic_entry_rate,
            ifnull(round(CAST(call_visits AS REAL) / visits, 3), 0) AS call_rate
        FROM per_pages_metrics
        ''')


def per_pages(first_date: dt.date, last_date: dt.date) -> None:
    """
    **Create temporary view with page_id's of all pages alive in period.**

    Arguments:

        first_date: first day of the period
        last_date: last day of the period

    An existing view with the same name will be overwritten.

    The view works upon the metrics database, which is connected via the
    global master connection `mst_conn`.
    """

    mst_conn.execute('DROP VIEW IF EXISTS per_pages')
    mst_conn.execute(f'''
        CREATE TEMPORARY VIEW per_pages AS
        WITH
            -- All timestamps between last scrape at or before first-date and 
            -- first scrape after last-date. 
            period (per_ts) AS (
                SELECT timestamp
                FROM mst_scrapes
                WHERE timestamp >= (SELECT max(timestamp)
                                    FROM mst_scrapes 
                                    WHERE date <= '{first_date}')
                    AND timestamp <= (SELECT min(timestamp) 
                                      FROM mst_scrapes 
                                      WHERE date > '{last_date}')
            ),
            -- Pages alive values at each period timestamp
            pages_alive_value (per_ts, page_id, alive) AS (
                SELECT DISTINCT
                    per_ts, page_id,
                    last_value(alive) OVER (
                        PARTITION BY per_ts, page_id
                        ORDER BY timestamp
                        ROWS BETWEEN UNBOUNDED PRECEDING
                            AND UNBOUNDED FOLLOWING
                    )
                FROM his_pages_life, period
                WHERE timestamp <= per_ts
            )
        -- Pages alive at some moment in period
        SELECT DISTINCT page_id
        FROM pages_alive_value
        WHERE alive
        ''')


def per_feedback(first_date: dt.date, last_date: dt.date) -> None:
    """
    **Create temporary feedback data for all pages alive in period.**

    Arguments:

        first_date: first day of the period
        last_date: last day of the period

    A new temporary view `per_feedback` is created with the following values
    for all pages alive at some moment during the period:

    - `page_id`: the id of the page
    - `neg_fb_cnt`: number of negative feedbacks [INTEGER]
    - `pos_fb_cnt`: number of positive feedbacks [INTEGER]
    - `fb_cnt`: total number of feedbacks [INTEGER: `neg_fb_cnt` + `pos_fb_cnt`]
    - `neg_txts`: newline separated negative feedback texts [TEXT]
    - `pos_txts`: newline separated positive feedback texts [TEXT]
    - `neg_fb_pct`: fraction of all feedbacks that are negative
      [REAL: `neg_fb_cnt` / `fb_cnt`]
    - `pos_fb_pct`: fraction of all feedbacks that are positive
      [REAL: `pos_fb_cnt` / `fb_cnt`]

    An existing view with the same name will be overwritten.

    The view works upon the metrics database, which is connected via the
    global master connection `mst_conn`.
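
    A worked illustration with hypothetical counts: a page with 3 negative
    and 9 positive feedbacks would get

        neg_fb_pct = 3 / 12    # 0.25
        pos_fb_pct = 9 / 12    # 0.75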
    """

    per_pages(first_date, last_date)
    mst_conn.execute('DROP VIEW IF EXISTS per_feedback')
    mst_conn.execute(f'''
        CREATE TEMPORARY VIEW per_feedback AS
        WITH
            -- Dates in period
            per_days AS (
                SELECT date_id
                FROM dates
                WHERE date >= '{first_date}' AND date <= '{last_date}'
            ),
            -- Available feedback data
            per_basic_feedback AS (
                SELECT
                    page_id,
                    sum(neg_cnt) AS neg_fb_cnt,
                    sum(pos_cnt) AS pos_fb_cnt,
                    sum(neg_cnt + pos_cnt) AS fb_cnt,
                    group_concat(neg_txt, char(10)) AS neg_txts,
                    group_concat(pos_txt, char(10)) AS pos_txts
                FROM feedback
                WHERE date_id IN per_days
                GROUP BY page_id
            ),
            -- Pages alive in period with available basic feedback data
            per_pages_feedback AS (
                SELECT
                    page_id,
                    ifnull(neg_fb_cnt,0) AS neg_fb_cnt,
                    ifnull(pos_fb_cnt,0) AS pos_fb_cnt,
                    ifnull(fb_cnt,0) AS fb_cnt,
                    neg_txts, pos_txts
                FROM per_pages
                LEFT JOIN per_basic_feedback USING (page_id)
            )
        -- Pages alive in period with basic and calculated feedback data
        SELECT
            *,
            ifnull(CAST (neg_fb_cnt AS REAL) / fb_cnt,0) AS neg_fb_pct,
            ifnull(CAST (pos_fb_cnt AS REAL) / fb_cnt,0) AS pos_fb_pct
        FROM per_pages_feedback
        ''')


def prep_feedbacks(combined_fbs: str) -> list[str]:
    r"""
    **Prepare textual feedbacks.**

    Arguments:

        combined_fbs: newline concatenated feedbacks

    Returns:

        prepared feedbacks

    The feedbacks are deduplicated, cleansed and filtered according to the
    following rules:

    - leading and trailing whitespace and / (converted hard return) is
      removed
    - leading ! , . / < = > \ ) ] ^ _ ` ~ is removed
    - feedback consisting of only digits, punctuation and/or whitespace is
      discarded
    - feedback with only one kind of character is discarded
    - non-printable characters are removed
    - feedback containing any of the following texts is discarded
        - `<script>`
        - `</script>`
        - `javascript`

    The resulting set is lexicographically sorted with feedbacks starting
    with an alphabetic character before the others.
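
    A minimal illustration with hypothetical feedback strings (the all-digit
    item is discarded and the leading ! is stripped):

        >>> prep_feedbacks('very helpful\n12345\n!clear text')
        ['clear text', 'very helpful']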
    """

    def feedback_sorting_key(txt: str) -> str:
        """
        **Helper function to sort user feedback texts.**

        Arguments:

            txt: text item to be sorted

        Returns:

            modified input for sorting purposes

        Converts the input to lowercase and prepends it with ~ (the printable
        ascii character that sorts last) if the first character is
        non-alphabetic.
        """

        txt = txt.lower()
        if txt[0] not in string.ascii_letters:
            txt = '~' + txt
        return txt

    non_starters = r'!,./<=>\)]^_`~'
    forbidden_text = {'<script>', '</script>', 'javascript'}
    fbs = []
    for fb in list(set(combined_fbs.split('\n'))):
        fb = fb.lstrip(non_starters + string.whitespace)
        fb = fb.rstrip(string.whitespace + '/')
        if not fb.strip(string.punctuation + string.digits + string.whitespace):
            continue
        if len(set(fb.lower())) == 1:
            continue
        if not fb.isprintable():
            fb = ''.join(c for c in fb if c.isprintable())
        for text in forbidden_text:
            if text in fb:
                fb = ''
                break
        if fb:
            fbs.append(fb)

    return sorted(fbs, key=feedback_sorting_key)


def cell_feedbacks(feedbacks: list[str]) -> str:
    """
    **Prepare textual feedbacks for worksheet cell.**

    Arguments:

        feedbacks: output from the `prep_feedbacks` function

    Returns:

        feedbacks to fit one cell

    Newline concatenated feedbacks, potentially truncated, to fit one cell of
    a report sheet.
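
    A minimal illustration with hypothetical feedbacks well below the cell
    limit, so no truncation header is added:

        >>> print(cell_feedbacks(['ok', 'fine']))
        ok
        fine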
    """

    fbs = '\n'.join(feedbacks)
    if len(fbs) > _CELL_MAX:
        # Cut at the last feedback separator before the cell limit; fall
        # back to a hard cut when a single feedback exceeds the limit.
        cut = fbs.rfind('\n', 0, _CELL_MAX)
        fbs = fbs[:cut] if cut > 0 else fbs[:_CELL_MAX]
        shown_fb = fbs.count('\n') + 1
        fbs = f'[{shown_fb} of {len(feedbacks)} shown]\n' + fbs
    return fbs


def new_pages_sheet(wb: ReportWorkbook,
                    rep_ts: str, ref_ts: str, incl_txt_fb: bool = True) -> None:
    """
    **Add a report sheet with data of the new pages.**

    Arguments:

        wb: workbook in which the new worksheet is created
        rep_ts: timestamp of the scrape that marks the (exclusive) end of the
            reporting period
        ref_ts: timestamp of the scrape that marks the (inclusive) start of the
            reporting period and the (exclusive) end of the reference period
        incl_txt_fb: if False, no textual feedback will be included

    This sheet lists all relevant available and calculated data for each page
    that was added since the reference scrape.
    """

    # Set (inclusive) dates for the reporting period
    rep_end = ts_to_d(rep_ts, -1)
    rep_start = ts_to_d(ref_ts) if ref_ts else rep_end

    # Focus on specific scrape data and create temporary views
    with Scrape(rep_ts):
        page_themes()
        per_metrics(rep_start, rep_end)
        per_feedback(rep_start, rep_end)

        qry = f'''
            WITH
                -- Period timestamps
                period (per_ts) AS (
                    SELECT timestamp
                    FROM mst_scrapes
                    WHERE timestamp > '{ref_ts}'
                        AND timestamp <= '{rep_ts}'
                ),
                -- Pages alive values at each period timestamp
                pages_alive_value (per_ts, page_id, alive) AS (
                    SELECT DISTINCT
                        per_ts, page_id,
                        last_value(alive) OVER (
                            PARTITION BY per_ts, page_id
                            ORDER BY timestamp
                            ROWS BETWEEN UNBOUNDED PRECEDING
                                AND UNBOUNDED FOLLOWING
                        )
                    FROM his_pages_life, period
                    WHERE timestamp <= per_ts
                ),
                -- Pages alive at each period timestamp
                living_pages (per_ts, page_id) AS (
                    SELECT per_ts, page_id
                    FROM pages_alive_value
                    WHERE alive
                ),
                -- Titles of living pages at each period timestamp
                pages_titles (per_ts, page_id, title) AS (
                    SELECT DISTINCT
                        per_ts, page_id,
                        last_value(title) OVER (
                            PARTITION BY per_ts, page_id
                            ORDER BY timestamp
                            ROWS BETWEEN UNBOUNDED PRECEDING
                                AND UNBOUNDED FOLLOWING
                        )
                    FROM living_pages
                    LEFT JOIN his_pages_info USING (page_id)
                    WHERE timestamp <= per_ts
                ),
                -- Title frequency at each period timestamp
                title_freq (per_ts, page_id, freq) AS (
                    SELECT
                        per_ts, page_id,
                        count(page_id) OVER (
                            PARTITION BY per_ts, title
                            ROWS BETWEEN UNBOUNDED PRECEDING
                                AND UNBOUNDED FOLLOWING
                        )
                    FROM pages_titles
                ),
                -- Pages added in the period with the timestamp of addition
                page_additions (page_id, add_ts) AS (
                    SELECT page_id, timestamp
                    FROM his_pages_life
                    WHERE timestamp IN period AND alive
                ),
                -- Info timestamp of the added pages
                added_pages_info_ts (page_id, add_ts, info_ts) AS (
                    SELECT DISTINCT
                        page_id, add_ts,
                        last_value(timestamp) OVER (
                            PARTITION BY page_id, add_ts
                            ORDER BY timestamp
                            ROWS BETWEEN UNBOUNDED PRECEDING
                                AND UNBOUNDED FOLLOWING
                        )
                    FROM page_additions
                    LEFT JOIN his_pages_info USING (page_id)
                    WHERE timestamp <= add_ts
                ),
                -- All aspects of the added pages, including title frequency
                added_pages_aspects AS (
                    SELECT add_ts, freq, i.*
                    FROM added_pages_info_ts AS a
                    LEFT JOIN his_pages_info AS i
                        ON a.page_id = i.page_id AND a.info_ts = i.timestamp
                    LEFT JOIN title_freq AS t
                        ON a.page_id = t.page_id AND a.add_ts = t.per_ts
                )
            SELECT
                add_ts, page_id, path, 
                title, freq, description,
                iif(description ISNULL, 0, length(description)) AS descr_len,
                first_h1, num_h1s, language, modified, pagetype,
                classes, theme, business, category,
                ifnull(referral_cnt, iif(unl_type = 'orphan', '0!', 0))
                    AS num_ed_refs,
                ed_text, aut_text,
                visits, views, entries, bounces, bounce_rate, exits, exit_rate,
                organic_entries, organic_entry_rate, call_visits, call_rate,
                neg_fb_cnt, pos_fb_cnt,
                ifnull(CAST (neg_fb_cnt + pos_fb_cnt AS REAL) / visits,0)
                    AS fb_rate,
                neg_fb_pct, pos_fb_pct, neg_txts, pos_txts
            FROM added_pages_aspects
            LEFT JOIN mst_paths USING (page_id)
            LEFT JOIN temp.themes USING (page_id)
            LEFT JOIN tsd_int_ref_cnt USING (page_id)
            LEFT JOIN tsd_unlinked USING (page_id)
            LEFT JOIN temp.per_metrics USING (page_id)
            LEFT JOIN temp.per_feedback USING (page_id)
            ORDER BY page_id, add_ts
            '''
        data = []
        # Calculate and insert extra data into the query results
        for cells in mst_conn.execute(qry).fetchall():
            cells = list(cells)
            # Reformat or delete textual feedback
            if incl_txt_fb:
                if cells[35]:
                    cells[35] = cell_feedbacks(prep_feedbacks(cells[35]))
                if cells[36]:
                    cells[36] = cell_feedbacks(prep_feedbacks(cells[36]))
            else:
                del cells[35:37]
            # Insert number of words in editorial content
            ed_text = cells[17]
            ed_wrd_cnt = len(re.findall(r'\w+', ed_text))
            cells.insert(18, ed_wrd_cnt)
            # Insert number of words in first h1
            first_h1 = cells[7]
            words_h1 = 0 if not first_h1 else len(re.findall(r'\w+', first_h1))
            cells.insert(8, words_h1)
            # Insert url name and path quality
            path = cells[2]
            name_q, path_q = url_quality(path)
            cells.insert(3, path_q / 100)
            cells.insert(3, name_q / 100)
            data.append(cells)
    wb.add_datasheet('New pages', data, incl_txt_fb)


def url_quality(url: str) -> tuple[int, int]:
    r"""**Assess the quality of the path component of a url.**

    Arguments:

        url: complete url or path component to be assessed

    Returns:

        quality of the name, quality of ancestral path

    Both the name and the complete ancestral path of the url are assessed,
    returning two scores on a scale of 0 to 100.

    An optimal url path consists of lower case characters, numbers and
    hyphens only. A name or ancestral path complying with this will get a
    score of 100. Any deviation lowers this score by multiplying it with a
    factor that depends on the number and kind of deviations. Four groups of
    deviating characters are distinguished:

    - unsafe characters: blank space and " < > % { } | \ ^ `
    - reserved characters: / ? # ] [ @ ! $ & ' ( ) * + , : ; =
    - suboptimal characters: . _ ~
    - capitals

    Fractions used when encountering one or more of these characters are:

    - unsafe: one 0.50 / more 0.40
    - reserved: one 0.65 / more 0.50
    - suboptimal: one 0.70 / more 0.60
    - capitals: one 0.90 / more 0.80
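
    For illustration, a hypothetical path whose name contains two suboptimal
    characters (factor 0.60) and one capital (factor 0.90), yielding
    int(100 * 0.60 * 0.90) = 54, with a fully optimal ancestral path:

        >>> url_quality('/bldcontentnl/themas/Auto_en_vervoer')
        (54, 100)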
    """

    def assess_q(part: str) -> int:
        """**Helper function to assess the quality of a part of a url path.**

        Arguments:

            part: url part to be assessed

        Returns:

            quality score on a 0-100 scale
        """
        one_unsafe = 0.50
        more_unsafe = 0.40
        one_reserved = 0.65
        more_reserved = 0.50
        one_subopt = 0.70
        more_subopt = 0.60
        one_caps = 0.90
        more_caps = 0.80
        unsafe_chars = re.compile(r'[ "<>%{}|\\^`]')
        reserved_chars = re.compile(r'[:/?#\[\]@!$&\'()*+,;=]')
        subopt_chars = re.compile('[._~]')
        caps = re.compile('[A-Z]')

        q = 100
        unsafe_match = unsafe_chars.findall(part)
        if unsafe_match:
            q *= one_unsafe if len(unsafe_match) == 1 else more_unsafe
        reserved_match = reserved_chars.findall(part)
        if reserved_match:
            q *= one_reserved if len(reserved_match) == 1 else more_reserved
        subopt_match = subopt_chars.findall(part)
        if subopt_match:
            q *= one_subopt if len(subopt_match) == 1 else more_subopt
        caps_match = caps.findall(part)
        if caps_match:
            q *= one_caps if len(caps_match) == 1 else more_caps
        return int(q)

    path = urlparse(url).path
    ancestral_path, name = path.rsplit('/', 1)

    # Assess the quality of the final part of the url
    name_q = assess_q(name)

    # Assess the quality of the ancestral path (path without name)
    ancestors = ancestral_path.split('/')
    if ancestors[0] == '':
        ancestors.pop(0)
    if ancestors and ancestors[-1] == name:
        # Do not assess the ancestors for a double name at the end
        ancestors.pop()
    anc_string = ''.join(ancestors)
    anc_q = assess_q(anc_string)

    return name_q, anc_q


def changed_aspects_sheet(wb: ReportWorkbook,
                          rep_ts: str, ref_ts: str) -> None:
    """
    **Add a report sheet with page changes.**

    Arguments:

        wb: workbook in which the new worksheet is created
        rep_ts: timestamp of the scrape that marks the (exclusive) end of the
            reporting period
        ref_ts: timestamp of the scrape that marks the (inclusive) start of the
            reporting period and the (exclusive) end of the reference period

    The changes are ordered per page and per aspect that changed within the
    reporting period. For each aspect change, the previous and the new value
    are listed, together with the timestamps of the scrapes that registered
    these values. Changes in textual aspects also list a modification factor,
    which is a measure of the difference between the two values. Refer to the
    documentation of the `mod_factor` function for further information
    about this factor.
    """

    qry = 'SELECT name FROM pragma_table_info("his_pages_info")'
    aspects = [row[0] for row in mst_conn.execute(qry).fetchall()
               if row[0] not in ('timestamp', 'page_id')]
    data = []
    for aspect in aspects:
        # Get values and modification factor of changes in aspect
        qry = f'''
            WITH
                -- Timestamps within the reporting period
                period AS (
                    SELECT timestamp
                    FROM mst_scrapes
                    WHERE timestamp > '{ref_ts}'
                        AND timestamp <= '{rep_ts}'
                ),
                aspect_changes AS (
                    SELECT
                        page_id, timestamp,
                        iif(lag({aspect}) OVER win = {aspect}, NULL, {aspect})
                            AS aspect
                    FROM his_pages_info
                    WHERE timestamp <= '{rep_ts}'
                    WINDOW win AS (PARTITION BY page_id ORDER BY timestamp)
                ),
                -- Changes in the relevant aspect before or in period
                aspect_hist AS (
                    SELECT *
                    FROM aspect_changes
                    WHERE aspect NOTNULL
                ),
                -- Aspect changes with previous values
                change_pairs AS (
                    SELECT
                        page_id,
                        lag(timestamp) OVER win AS old_ts,
                        lag(aspect) OVER win AS old_value,
                        timestamp AS new_ts,
                        aspect AS new_value
                    FROM aspect_hist
                    WINDOW win AS (PARTITION BY page_id ORDER BY timestamp)
                )
            SELECT page_id, path, business, language, pagetype,
                '{aspect}' AS aspect,
                new_value, new_ts,
                old_value, old_ts
            FROM change_pairs AS c
            LEFT JOIN mst_paths USING (page_id)
            LEFT JOIN his_pages_info AS i USING (page_id) 
            WHERE old_ts NOTNULL AND new_ts IN period
                AND c.new_ts = i.timestamp
            ORDER BY page_id, old_ts
            '''
        for cells in mst_conn.execute(qry).fetchall():
            cells = list(cells)
            # Calculate and insert modification factor
            if aspect in ('title', 'description', 'first_h1',
                          'ed_text', 'aut_text'):
                new_txt, old_txt = cells[6], cells[8]
                mf = mod_factor(old_txt, new_txt)
            else:
                mf = None
            cells.insert(8, mf)
            data.append(cells)

    # Sort changes on page_id, aspect, timestamp of new value
    data.sort(key=itemgetter(0, 5, 7))
    wb.add_datasheet('Changed aspects', data, shading='cluster')


def mod_factor(ref_text: str, act_text: str) -> float:
    """
    **Calculate the modification factor of a text string.**

    Arguments:

        ref_text: text acting as reference
        act_text: actual text to compare against the reference

    Returns:

        modification factor in the range of 0 to 1

    The returned value is a measure of the difference between two texts on a
    scale from 0 (texts are exactly equal) to 1 (texts are completely
    different). The value is calculated as `1 - (SR1 + SR2)/2`, where `SR`
    stands for the similarity ratio as defined in the Python standard
    `difflib` module. `SR1` represents the similarity of both texts. `SR2` is
    the similarity of the sorted set of words from both texts. Averaging
    these ratios has the effect that changes in both wording and phrasing are
    distinguished from changes in phrasing or wording only.
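
    Two boundary illustrations (identical versus completely different texts):

        >>> mod_factor('abc', 'abc')
        0.0
        >>> mod_factor('abc', 'xyz')
        1.0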
    """

    sm = difflib.SequenceMatcher(a=ref_text, b=act_text)
    lib_ratio = sm.ratio()
    sm.set_seq1(a=sorted(list(set(ref_text.split()))))
    sm.set_seq2(b=sorted(list(set(act_text.split()))))
    set_ratio = sm.ratio()
    return 1 - (lib_ratio + set_ratio) / 2


def all_pages_sheet(wb: ReportWorkbook,
                    rep_ts: str, ref_ts: str, incl_txt_fb: bool = True) -> None:
    """
    **Add a report sheet with data of all pages.**

    Arguments:

        wb: workbook in which the new worksheet is created
        rep_ts: timestamp of the scrape that marks the (exclusive) end of the
            reporting period
        ref_ts: timestamp of the scrape that marks the (inclusive) start of the
            reporting period and the (exclusive) end of the reference period
        incl_txt_fb: if False, no textual feedback will be included

    This sheet lists the relevant available and calculated data for all pages
    alive at the reporting timestamp `rep_ts`.
    """

    # Set (inclusive) dates for the reporting period
    rep_end = ts_to_d(rep_ts, -1)
    rep_start = ts_to_d(ref_ts) if ref_ts else rep_end

    # Focus on specific scrape data and create temporary views
    with Scrape(rep_ts):
        page_themes()
        per_metrics(rep_start, rep_end)
        per_feedback(rep_start, rep_end)

        qry = '''
            WITH
                scr_title_freq AS (
                    SELECT
                        title,
                        count(*) AS title_freq
                    FROM tsd_pages_info
                    GROUP BY title
                )
            SELECT
                page_id, path,
                title, title_freq, description,
                iif(description ISNULL, 0, length(description)) AS descr_len,
                first_h1, num_h1s, language, modified, pagetype,
                classes, theme, business, category,
                ifnull(referral_cnt, iif(unl_type = 'orphan', '0!', 0))
                    AS num_ed_refs,
                ed_text, aut_text,
                visits, views, entries, bounces, bounce_rate, exits, exit_rate,
                organic_entries, organic_entry_rate, call_visits, call_rate,
                neg_fb_cnt, pos_fb_cnt,
                ifnull(CAST (neg_fb_cnt + pos_fb_cnt AS REAL) / visits,0)
                    AS fb_rate,
                neg_fb_pct, pos_fb_pct, neg_txts, pos_txts
            FROM tsd_pages_info
            LEFT JOIN mst_paths USING (page_id)
            LEFT JOIN scr_title_freq USING (title)
            LEFT JOIN temp.themes USING (page_id)
            LEFT JOIN tsd_int_ref_cnt USING (page_id)
            LEFT JOIN tsd_unlinked USING (page_id)
            LEFT JOIN temp.per_metrics USING (page_id)
            LEFT JOIN temp.per_feedback USING (page_id)
            ORDER BY visits DESC
            '''
        data = []
        # Calculate and insert extra data into the query results
        for cells in mst_conn.execute(qry).fetchall():
            cells = list(cells)
            # Reformat or delete textual feedback
            if incl_txt_fb:
                if cells[34]:
                    cells[34] = cell_feedbacks(prep_feedbacks(cells[34]))
                if cells[35]:
                    cells[35] = cell_feedbacks(prep_feedbacks(cells[35]))
            else:
                del cells[34:36]
            # Insert number of words in editorial content
            ed_text = cells[16]
            ed_wrd_cnt = len(re.findall(r'\w+', ed_text))
            cells.insert(17, ed_wrd_cnt)
            # Insert number of words in first h1
            first_h1 = cells[6]
            words_h1 = 0 if not first_h1 else len(re.findall(r'\w+', first_h1))
            cells.insert(7, words_h1)
            # Insert url name and path quality
            path = cells[1]
            name_q, path_q = url_quality(path)
            cells.insert(2, path_q / 100)
            cells.insert(2, name_q / 100)
            data.append(cells)
    wb.add_datasheet('All pages', data, incl_txt_fb)


def all_feedback_sheet(wb: ReportWorkbook, rep_ts: str, ref_ts: str) -> None:
    """
    **Add a report sheet with textual feedback of all pages.**

    Arguments:

        wb: workbook in which the new worksheet is created
        rep_ts: timestamp of the scrape that marks the (exclusive) end of the
            reporting period
        ref_ts: timestamp of the scrape that marks the (inclusive) start of the
            reporting period and the (exclusive) end of the reference period

    This sheet lists available textual feedback for pages alive at the
    reporting timestamp `rep_ts`.
    """

    # Set (inclusive) dates for the reporting period
    rep_end = ts_to_d(rep_ts, -1)
    rep_start = ts_to_d(ref_ts) if ref_ts else rep_end

    # Focus on specific scrape data and create temporary views
    with Scrape(rep_ts):
        per_feedback(rep_start, rep_end)
        qry = '''
            SELECT
                page_id, path, title, neg_txts, pos_txts
            FROM tsd_pages_info
            LEFT JOIN mst_paths USING (page_id)
            LEFT JOIN temp.per_feedback USING (page_id)
            WHERE neg_txts NOTNULL OR pos_txts NOTNULL
            ORDER BY fb_cnt DESC'''
        data = []
        # Form separate rows for each distinct page/feedback combination
        for page_id, path, title, neg_txts, pos_txts in \
                mst_conn.execute(qry).fetchall():
            if neg_txts:
                for txt in prep_feedbacks(neg_txts):
                    data.append((page_id, path, title, 'neg',
                                 int(txt) if txt.isdigit() else txt))
            if pos_txts:
                for txt in prep_feedbacks(pos_txts):
                    data.append((page_id, path, title, 'pos',
                                 int(txt) if txt.isdigit() else txt))
    wb.add_datasheet('All feedback', data, shading='cluster', cluster_col=3)


def ed_links_sheet(wb: ReportWorkbook, rep_ts: str) -> None:
    """
    **Add a report sheet with editorial links.**

    Arguments:

        wb: workbook in which the new worksheet is created
        rep_ts: timestamp of the scrape that marks the (exclusive) end of the
            reporting period

    This sheet lists all links that were found in the editorial texts of all
    pages alive at the reporting timestamp `rep_ts`.
    """

    with Scrape(rep_ts):
        qry = f'''
            SELECT
                pp.page_id, pp.path, pagetype, text, link_id,
                CASE
                    WHEN l.url IS NULL
                    THEN lp.path
                    ELSE CASE
                             WHEN l.url LIKE '{ROOT_URL}%'
                             THEN replace(l.url, '{ROOT_URL}', '')
                             ELSE l.url
                         END
                END ||
                CASE
                    WHEN l.anchor IS NULL
                    THEN ''
                    ELSE '#' || l.anchor
                END AS link,
                lp.page_id, status
            FROM tsd_ed_links
            LEFT JOIN mst_paths AS pp USING (page_id)
            LEFT JOIN tsd_pages_info USING (page_id)
            LEFT JOIN mst_links AS l USING (link_id)
            LEFT JOIN mst_paths AS lp ON l.page_id = lp.page_id
            LEFT JOIN tsd_links USING (link_id)
            ORDER BY pp.page_id, text, link_id
            '''
        data = mst_conn.execute(qry).fetchall()
    wb.add_datasheet('Editorial links', data, shading='cluster')


def redirs_aliases_sheet(
        wb: ReportWorkbook, rep_ts: str) -> None:
    """
    **Add a report sheet with redirects and aliases.**

    Arguments:

        wb: workbook in which the new worksheet is created
        rep_ts: timestamp of the scrape that marks the (exclusive) end of the
            reporting period

    This sheet lists all redirects and alias urls, together with their final
    wcm paths, that were encountered during the scrape with timestamp `rep_ts`.
    """

    with Scrape(rep_ts):
        qry = '''
            SELECT
                redir_type, req_id,
                CASE
                    WHEN req_id IS NULL
                    THEN req_url
                    ELSE req.path
                END AS req,
                redir_id,
                CASE
                    WHEN redir_id IS NULL
                    THEN redir_url
                    ELSE redir.path
                END AS redir
            FROM tsd_redirs AS r
            LEFT JOIN mst_paths AS req ON r.req_id = req.page_id
            LEFT JOIN mst_paths AS redir ON r.redir_id = redir.page_id
            ORDER BY req
            '''
        data = mst_conn.execute(qry).fetchall()
        # Convert redir type to int if appropriate
        data = [(int(t[0]) if t[0].isdigit() else t[0], *t[1:]) for t in data]
    wb.add_datasheet('Redirects and aliases', data)


def downloads_sheet(wb: ReportWorkbook, rep_ts: str, ref_ts: str) -> None:
    """
    **Add a report sheet with all downloads details.**

    Arguments:

        wb: workbook in which the new worksheet is created
        rep_ts: timestamp of the scrape that marks the (exclusive) end of the
            reporting period
        ref_ts: timestamp of the scrape that marks the (inclusive) start of the
            reporting period and the (exclusive) end of the reference period

    The sheet contains all downloads that were used during the period,
    along with the download details, metrics and referring pages. Codes in
    the names of the following classes of downloads are extracted and
    presented in the various fields of the sheet.

    - *Documents* - Descriptions are taken from the text anchors of the download
      links. Further details stem from the codes that are part of the filename,
      which are standardised by and used in the GDB document management
      system that acts as the repository for these files. The actual document
      codes deviate in various ways from the standard. The decoding algorithm
      catches as many non-standard codes as possible. Downloads that defy
      this approach are classified as documents with a 'malformed code'.
    - *Payroll tax tables* - The algorithm used to deduce the description
      from the filename has been reverse-engineered from existing downloads.
      Further details are not available.
    - *Rulings* - Descriptions are taken from the text anchors of the download
      links. The coding of the filenames is broken down into some of the
      fields, but does not provide more information than the descriptions.
    """

    # Create and fill temporary dl_links table
    mst_conn.execute('DROP TABLE IF EXISTS dl_links')
    mst_conn.execute('''
        CREATE TEMPORARY TABLE dl_links (
            link_id	    INTEGER,
            filename	TEXT NOT NULL,
            dl_grp	    TEXT NOT NULL,
            dl_type		TEXT,
            descr		TEXT,
            doc_code	TEXT,
            subgroup	TEXT,
            seq_nr		TEXT,
            funct		INTEGER,
            add_typ		TEXT,
            yr_ver		TEXT,
            form		TEXT,
            ctry_lan	TEXT,
            FOREIGN KEY(link_id) REFERENCES mst_links,
            PRIMARY KEY(link_id)
        )
        ''')
    # Pattern for the well-formed doc code (allowing some deviations)
    patt_wf_doc_code = \
        r'(?P<subgroup>[a-z]{1,5})' \
        r'(?P<seq_nr>\d[a-z\d]{1,3}?[a-z\d]?)' \
        r'(?P<e_doc>[e]?)' \
        r'[-_]?' \
        r'(?P<funct>\d)' \
        r'(?P<add_typ>[beiotz])?' \
        r'(?P<yr_ver>[-\*\d]?\d{1,2})' \
        r'-?(?P<form>ed|fd|fol|hm|ke|odt|pl)' \
        r'-?' \
        r'(?P<ctry_lan>[a-z]{1,})?' \
        r'(?=\.)'
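    # Illustrative (hypothetical filename): searching 'lb0261t01fd.pdf'
    # yields subgroup='lb', seq_nr='026', funct='1', add_typ='t',
    # yr_ver='01' and form='fd'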
    # Pattern to search a doc code that is malformed or incorrect
    patt_mf_doc_code = \
        r'[-_]' \
        r'(?P<doc_code>' \
        r'(?P<subgroup>[a-z]{1,3})' \
        r'[-_]?' \
        r'\d{1,3}' \
        r'[a-z\d_-]*' \
        r'[a-z]{2}' \
        r')' \
        r'\.'
    # Pattern for payroll tax tables
    patt_pr_table = \
        r'(?P<colour>wit|groen)_' \
        r'(?P<table>kw|mnd|4?wk|dag|bb)_' \
        r'(?P<country>nl|be|sa|lk|dl)?_?' \
        r'(?P<sit_conv>std|ag|bh|ci|dj|ek|fl)_?' \
        r'(?P<liability>alg|svb)?_?' \
        r'(?P<vouchers>vb19min|vb19meer)?_' \
        r'[01]{0,4}' \
        r'(?P<year>\d{4})' \
        r'[01]{0,4}\.' \
        r'(?P<format>pdf|xlsx)'
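    # Illustrative (hypothetical filename): 'wit_mnd_nl_std_2024.pdf'
    # matches with colour='wit', table='mnd', country='nl', sit_conv='std'
    # and year='2024'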
    # Pattern for rulings documents
    patt_ruling = \
        r'rul-' \
        r'(?P<date>(?P<year>\d{4})(?P<month>\d{2})(?P<day>\d{2}))-' \
        r'(?P<subgroup>.*)-' \
        r'(?P<seq_nr>\d*)\.pdf'
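    # Illustrative (hypothetical filename): 'rul-20240115-ib-001.pdf'
    # yields date='20240115', subgroup='ib' and seq_nr='001'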
    # Select download links
    sel_qry = f'''
        SELECT link_id, substr(url, {len(DL_SITE)+1}) AS dl_path
        FROM mst_links
        WHERE url LIKE '{DL_SITE}/%'
        '''
    values = []
    for link_id, dl_path in mst_conn.execute(sel_qry).fetchall():
        filename = Path(dl_path).name.lower()
        dl_grp = Path(dl_path).parts[1]
        descr = doc_code = subgroup = seq_nr = funct = None
        add_typ = yr_ver = form = ctry_lan = None
        if dl_match := re.match(patt_pr_table, filename):
            dl_type = 'Payroll tax table'
            descr = ''
            # <colour>wit|groen
            match dl_match['colour']:
                case 'wit': descr += 'Witte'
                case 'groen': descr += 'Groene'
            # <table>kw|mnd|4?wk|dag|bb
            txt = ''
            match dl_match['table']:
                case 'kw': txt = ' Kwartaal<lp>'
                case 'mnd': txt = ' Maand<lp>'
                case '4wk': txt = ' Vierweken<lp>'
                case 'wk': txt = ' Week<lp>'
                case 'dag': txt = ' Dag<lp>'
                case 'bb': txt = ' Jaar<bb>'
            txt = txt.replace(
                '<lp>', 'loon-tabel loonbelasting/premie volksverzekeringen')
            txt = txt.replace(
                '<bb>', 'loon-tabel bijzondere beloningen')
            descr += txt
            # <vouchers>vb19min|vb19meer
            if dl_match['vouchers']:
                txt = ' met vakantiebonnen voor <nb> vakantiedagen per jaar'
            match dl_match['vouchers']:
                case 'vb19min': descr += txt.replace('<nb>', '19 of minder')
                case 'vb19meer': descr += txt.replace('<nb>', '20 of meer')
            # <country>nl|be|sa|lk|dl
            match dl_match['country']:
                case 'nl': descr += ' Nederland'
                case 'be': descr += ' Belgiƫ'
                case 'sa': descr += ' Suriname of Aruba'
                case 'lk': descr += ' Land van de landenkring'
                case 'dl': descr += ' Derde land'
            # <sit_conv>std|ag|bh|ci|dj|ek|fl
            match dl_match['sit_conv']:
                case 'std': descr += \
                    ', Standaard'
                case 'ag': descr += \
                    ', A - Uitsluitend premieplichtig AOW, ANW en WL'
                case 'bh': descr += \
                    ', B - Uitsluitend belastingplichtig'
                case 'ci': descr += \
                    ', C - Belastingplichtig en premieplichtig AOW en AN'
                case 'dj': descr += \
                    ', D - Belastingplichtig en premieplichtig WL'
                case 'ek': descr += \
                    ', E - Uitsluitend premieplichtig AOW en ANW'
                case 'fl': descr += \
                    ', F - Uitsluitend premieplichtig WL'
            # <liability>alg|svb
            match dl_match['liability']:
                case 'alg': pass
                case 'svb': descr += ', SVB'
            # <year>\d{4}
            if year := dl_match['year']:
                descr += f' (uitgave januari {year})'
        elif dl_match := re.search(patt_wf_doc_code, filename):
            dl_type = 'Document'
            doc_code = dl_match[0]
            subgroup = dl_match['subgroup']
            seq_nr = dl_match['seq_nr']
            funct = dl_match['funct']
            add_typ = dl_match['add_typ']
            yr_ver = dl_match['yr_ver']
            form = dl_match['form']
            ctry_lan = dl_match['ctry_lan']
        elif dl_match := re.search(patt_mf_doc_code, filename):
            dl_type = 'Document (malformed code)'
            doc_code = dl_match['doc_code']
            subgroup = dl_match['subgroup']
        elif dl_match := re.match(patt_ruling, filename):
            dl_type = 'Ruling'
            subgroup = dl_match['subgroup']
            seq_nr = dl_match['seq_nr']
        else:
            dl_type = 'Non-standard download file'

        values.append((link_id, filename, dl_grp, dl_type, descr, doc_code,
                       subgroup, seq_nr, funct, add_typ, yr_ver, form,
                       ctry_lan))
    # Insert the gathered data with one parameterised statement;
    # binding the values avoids quoting problems with literal SQL
    mst_conn.executemany('''
        INSERT INTO dl_links
            (link_id, filename, dl_grp, dl_type, descr, doc_code,
            subgroup, seq_nr, funct, add_typ, yr_ver, form, ctry_lan)
        VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
        ''', values)

    # Set (inclusive) dates for the reporting period
    rep_end = ts_to_d(rep_ts, -1)
    rep_start = ts_to_d(ref_ts) if ref_ts else rep_end

    # Select and gather data to be reported
    qry = f'''
        WITH
            -- Dates in period
            per_days AS (
                SELECT date_id
                FROM dates
                WHERE date >= '{rep_start}' AND date <= '{rep_end}'
            ),
            -- Download metrics in period
            per_downloads AS (
                SELECT DISTINCT
                    link_id,
                    sum(nb_visits) AS uniq_downloads,
                    sum(nb_hits) AS downloads
                FROM downloads
                WHERE date_id IN per_days
                GROUP BY link_id
            ),
            -- Last presence of editorial download links at start of period
            last_presence_dl_edlinks AS (
                SELECT DISTINCT
                    page_id, link_id,
                    last_value(present) OVER win AS present
                FROM his_ed_links
                WHERE link_id IN (SELECT link_id FROM per_downloads)
                    AND timestamp <= '{ref_ts}'
                WINDOW win AS (
                    PARTITION BY page_id, link_id
                    ORDER BY timestamp
                    ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING
                )
            ),
            -- Editorial download links in period
            per_dl_edlinks AS (
                -- Editorial download links at start of period
                SELECT DISTINCT page_id, link_id
                FROM last_presence_dl_edlinks
                WHERE present > 0
                    UNION
                -- Editorial download links that became present during period
                SELECT DISTINCT page_id, link_id
                FROM his_ed_links
                WHERE link_id IN (SELECT link_id FROM per_downloads)
                    AND timestamp > '{ref_ts}' AND timestamp <= '{rep_ts}'
                    AND present > 0
            ),
            -- Downloads with all separate pages referring to them
            per_dl_refs_sep AS (
                SELECT DISTINCT link_id, page_id
                FROM per_downloads
                LEFT JOIN per_dl_edlinks USING (link_id)
            ),
            -- Downloads with combined pages referring to them
            per_dl_refs_comb AS (
                SELECT 
                    link_id,
                    group_concat(page_id, ', ') AS ref_page_ids
                FROM per_dl_refs_sep
                GROUP BY link_id
            ),
            -- Last editorial download links before or at end of period
            -- (including links that are no longer present)
            last_dl_edlinks AS (
                SELECT DISTINCT
                    page_id, link_id,
                    last_value(text) OVER win AS text
                FROM his_ed_links
                WHERE link_id IN (SELECT link_id FROM per_downloads)
                    AND timestamp <= '{rep_ts}'
                WINDOW win AS (
                    PARTITION BY page_id, text, link_id
                    ORDER BY timestamp
                    ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING
                )
            ),
            -- Timestamps of last pages info before or at end of period
            last_info_ts (page_id, timestamp) AS (
                SELECT DISTINCT page_id, last_value(timestamp) OVER win
                FROM his_pages_info
                WHERE timestamp <= '{rep_ts}'
                WINDOW win AS (
                    PARTITION BY page_id
                    ORDER BY timestamp
                    ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING
                )
            ),
            -- Last pages info before or at end of period
            last_pages_info AS (
                SELECT *
                FROM mst_paths
                LEFT JOIN last_info_ts USING (page_id)
                LEFT JOIN his_pages_info USING (page_id, timestamp)
            ),
            -- Last editorial download link details before or at end of period
            last_dl_edlink_details AS (
                SELECT
                    link_id,
                    -- truncate text before first newline
                    iif(instr(text, char(10)),
                        substr(text, 1, instr(text, char(10))-1),
                        text) AS text,
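                    -- keep only the part after the last '/' as the file name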
                    replace(url, rtrim(url, replace(url, '/', '')), '') AS file,
                    language
                FROM last_dl_edlinks
                LEFT JOIN mst_links USING (link_id)
                LEFT JOIN last_pages_info USING (page_id)
            )
        SELECT DISTINCT
            link_id, filename, dl_grp, dl_type,
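            -- keep only the part after the last '.' as the file type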
            replace(filename, rtrim(filename, replace(filename, '.', '')), '')
                AS file_type,
            ifnull(descr, first_value(text) OVER win) AS descr,
            doc_code, subgroup, seq_nr, funct, add_typ, yr_ver, form, ctry_lan,
            uniq_downloads, downloads, ref_page_ids
        FROM per_downloads
        LEFT JOIN last_dl_edlink_details USING (link_id)
        LEFT JOIN dl_links USING (link_id)
        LEFT JOIN per_dl_refs_comb USING (link_id)
        WINDOW win AS (
            PARTITION BY link_id
            -- prefer the longest Dutch text
            ORDER BY language DESC, length(text) DESC
            ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING
        )
        ORDER BY downloads DESC
        '''
    data = mst_conn.execute(qry).fetchall()
    wb.add_datasheet('Downloads', data)


def paths_sheet(wb: ReportWorkbook, rep_ts: str, ref_ts: str) -> None:
    """
    **Add a report sheet with all page paths.**

    Arguments:

        wb: workbook in which the new worksheet is created
        rep_ts: timestamp of the scrape that marks the (exclusive) end of the
            reporting period
        ref_ts: timestamp of the scrape that marks the (inclusive) start of the
            reporting period and the (exclusive) end of the reference period

    This sheet lists the paths of all pages that were alive (at least at some
    moment) in the reporting period.
    """

    # Set (inclusive) dates for the reporting period
    rep_end = ts_to_d(rep_ts, -1)
    rep_start = ts_to_d(ref_ts) if ref_ts else rep_end

    per_pages(rep_start, rep_end)
    qry = f'''
        SELECT page_id, path
        FROM temp.per_pages
        LEFT JOIN mst_paths USING (page_id)
        ORDER BY page_id'''
    data = mst_conn.execute(qry).fetchall()
    wb.add_datasheet('Paths', data)


class ReportWorkbook(xlsxwriter.Workbook):
    """
    **Workbook subclass for writing site reports.**

    Instantiating from this class creates a site report (workbook) with
    one sheet named 'Report'. This sheet contains the info that was given when
    instantiating, together with release notes of the site report.

    ***Instance methods:***

    - `get_fmt`: get a format to use when writing cells in a sheet
    - `add_datasheet`: add a sheet with tabular data
    - `add_buttons`: add navigation buttons to the 'Report' sheet
    - `close_and_publish`: call finalisation code, close file and publish it

    ***Instance attributes:***

    - `rep_fmts`: format library to be used when writing data to report sheets
      using the `add_datasheet` method; this library can be accessed using
      the `get_fmt` instance method

    The formats in the `rep_fmts` library can be accessed via three-level
    indexing with respective values for `num_format`, `alignment` and
    `shading`. While `shading` is boolean, the potential values for the
    `num_format` and `alignment` are given below, and can be used as values
    for the *cols* column specification in the report configuration file.

    ***Values for `num_format`***

    - 'std': 'General' format according to the spreadsheet application
    - 'id':  plain integer
    - 'int': integer with thousands separators
    - 'int-delta': +/- signed integer with thousands separators
    - 'flt3': float with three decimals
    - 'flt3-delta': +/- signed float with three decimals
    - 'pct0': percentage without decimals
    - 'pct2': percentage with two decimals
    - 'hdr': format for cells with column headers

    Two additional formats can be used that depend on the type of value to be
    written in a cell:

    - 'int/flt3': 'int' for integer, 'flt3' for float
    - 'int-delta/flt3-delta': 'int-delta' for integer, 'flt3-delta' for float

    ***Values for `alignment`***

    - 'std': default alignment according to the spreadsheet application
    - 'left': left aligned
    - 'center': center aligned
    - 'right': right aligned
    """

    def __init__(self, wb_path: Path, rep_info: list[tuple]) -> None:
        """
        **Create and initialise a report workbook.**

        Arguments:

            wb_path: path of the report workbook to be created
            rep_info: name/values pairs to be written to the 'Report' sheet

        Returns:

            instance of ReportWorkbook

        A new workbook is created with one sheet named 'Report', containing
        report information given via the `rep_info` parameter, and release
        notes.
        """
        super().__init__(wb_path, {'constant_memory': True})

        # Store all potential formats to write the sheets
        self.rep_fmts = {}
        f0 = {'border_color': report_conf.colour_brdr, 'left': 1, 'right': 1}
        for num_fmt in ('std', 'id', 'int', 'int-delta', 'flt3',
                        'flt3-delta', 'pct0', 'pct2', 'hdr'):
            self.rep_fmts[num_fmt] = {}
            f1 = {}
            match num_fmt:
                case 'std':
                    f1 = {'num_format': 'General'}
                case 'id':
                    f1 = {'num_format': '0'}
                case 'int':
                    f1 = {'num_format': '#,##0'}
                case 'int-delta':
                    f1 = {'num_format': '+#,##0;-#,##0;-'}
                case 'flt3':
                    f1 = {'num_format': '0.000'}
                case 'flt3-delta':
                    f1 = {'num_format': '0.000'}
                case 'pct0':
                    f1 = {'num_format': '0%'}
                case 'pct2':
                    f1 = {'num_format': '0.00%'}
                case 'hdr':
                    f1 = {'bold': True, 'font_color': 'white',
                          'fg_color': report_conf.colour_hdr_bg,
                          'border_color': 'white'}
            for align in ('std', 'left', 'center', 'right'):
                self.rep_fmts[num_fmt][align] = {}
                f2 = {} if align == 'std' else {'align': align}
                for shade in (False, True):
                    f3 = {'fg_color': report_conf.colour_shade} if shade else {}
                    self.rep_fmts[num_fmt][align][shade] = \
                        self.add_format(f0 | f1 | f2 | f3)

        # Insert 'Report' worksheet and add report information and release notes
        ws, (row, col) = self.add_datasheet('Report', rep_info,
                                            shading='number')
        max_length = max([len(n) for n in _REP_REL_NOTES])
        ws.insert_textbox(
            row + 3, 0, '\n'.join(_REP_REL_NOTES),
            {'x_offset': 20,
             'object_position': 3,
             'line': {'color': report_conf.colour_hdr_bg, 'width': 1},
             'fill': {'color': report_conf.colour_shade},
             'width': max_length * 6.4,
             'height': len(_REP_REL_NOTES) * 18 + 30,
             'align': {'vertical': 'middle'}}
        )

    def get_fmt(self, num_format: str, alignment: str = 'std',
                shaded: bool = False) -> xlsxwriter.format.Format:
        """
        **Get format to write report cells.**

        Arguments:

            num_format: specification of the required number format
            alignment: specification of the required alignment
            shaded: True if the cell should have a slightly darker background

        Returns:

            format to be used when writing worksheet cells

        Potential values of the `num_format` and `alignment` parameters are
        documented in the `ReportWorkbook` class.
        """
        return self.rep_fmts[num_format][alignment][shaded]

    def add_datasheet(self, sh_name: str, data: list[Union[tuple, list]],
                      incl_txt_fb: bool = True, shading: str = None,
                      cluster_col: int = 0, cluster_del: bool = False) \
            -> tuple[xlsxwriter.worksheet.Worksheet, tuple[int, int]]:
        """
        **Add a data sheet to the report workbook.**

        Arguments:

            sh_name: name of the sheet
            data: two-dimensional data structure with rows of values
            incl_txt_fb: if False, columns with textual feedback will be skipped
            shading: method to shade rows (None, 'number' or 'cluster')
            cluster_col: zero indexed column number of the value used for
                shading
            cluster_del: do not write the cluster value to the sheet

        Returns:

            reference to the new worksheet and the last (row, col) position

        The configuration for the sheet is read from the report configuration
        file. The `sh_name` with which the sheet is created has to be present
        as first level key value in this configuration file.

        When `shading` is specified as 'number', the rows will get toggled
        background shading after one to three rows (depends on the total
        number of rows) for better readability. When `shading` is 'cluster',
        shading of rows will be toggled after the value in column
        `cluster_col` changes. This value will not be written to the sheet if
        `cluster_del` is True. As per default the sheet rows will have no
        shading.
        """

        # Get the specifications for this sheet
        try:
            with open(report_conf.rep_config_json) as f:
                sheet_spec = json.load(f)[sh_name]
        except KeyError:
            raise ValueError(
                f'no specifications available in {report_conf.rep_config_json} '
                f'for sheet named {sh_name}')

        col_specs = [s for s in sheet_spec['cols']
                     if incl_txt_fb or 'feedback txt' not in s[0]]

        # Create and configure the sheet
        sheet = self.add_worksheet(sh_name)
        for col, col_spec in enumerate(col_specs):
            sheet.write(0, col, col_spec[0], self.get_fmt('hdr'))
            sheet.set_column(col, col, col_spec[1])
        if sheet_spec['frozen_rows'] or sheet_spec['frozen_cols']:
            # With both zero, xlsxwriter delivers a partly corrupt xlsx
            sheet.freeze_panes(
                sheet_spec['frozen_rows'], sheet_spec['frozen_cols'])
        if sheet_spec['show_grid']:
            sheet.hide_gridlines(0)
        else:
            sheet.hide_gridlines(2)
        if sheet_spec['autofilter']:
            sheet.autofilter(0, 0, len(data), len(col_specs) - 1)

        # Write the data
        shaded = False
        shade_num = 0
        last_cluster = ''
        if shading == 'number':
            # Shading toggles every `shade_num` rows
            if len(data) <= 10:
                shade_num = 1
            elif len(data) < 20:
                shade_num = 2
            else:
                shade_num = 3
        row, col = 1, 0  # Initialize to avoid inspection warning
        for row, values in enumerate(data, start=1):
            if shading == 'number':
                # Toggle shading after `shade_num` rows
                shaded = row % (2 * shade_num) not in list(
                    range(1, shade_num + 1))
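                # e.g. with shade_num = 2: rows 1-2 and 5-6 plain,
                # rows 3-4 and 7-8 shaded, and so on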
            elif shading == 'cluster':
                # Toggle shading after value of `cluster` changes
                if cluster_del:
                    values = list(values)
                    cluster = values.pop(cluster_col)
                else:
                    cluster = values[cluster_col]
                if not last_cluster:
                    last_cluster = cluster
                if cluster != last_cluster:
                    shaded = not shaded
                    last_cluster = cluster
            for col, value in enumerate(values):
                num_format, alignment = col_specs[col][2:4]
                if '/' in num_format:
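                    # e.g. 'int/flt3' resolves to 'int' for 4 and to
                    # 'flt3' for 4.0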
                    for fmt in num_format.split('/'):
                        if ('flt' in fmt and type(value) == float) or \
                                ('int' in fmt and type(value) in (int, str)):
                            num_format = fmt
                            break
                cell_format = self.get_fmt(num_format, alignment, shaded)
                sheet.write(row, col, value, cell_format)

        # Return the resulting data structures
        return sheet, (row, col)

    def add_buttons(self) -> None:
        """
        **Create buttons to navigate to data sheets.**

        Sheet buttons are created on the 'Report' sheet as text boxes with
        clickable links to the various data sheets.
        """

        worksheets = self.worksheets().copy()
        report_sheet = worksheets.pop(0)
        head_spc = 35
        button_h = 40
        button_w = 180
        button_spc = 20
        lr_margin = 25

        report_sheet.insert_textbox(
            'D2', 'Go to sheet',
            {
                'width': button_w + 2 * lr_margin,
                'height': len(worksheets) * (button_h + button_spc) + head_spc,
                'y_offset': 10,
                'object_position': 3,
                'line': {'color': 'black'},
                'fill': {'color': 'white'},
                'align': {'horizontal': 'left'},
                'font': {'bold': True}
            }
        )
        for num, ws in enumerate(worksheets):
            report_sheet.insert_textbox(
                'D2', ws.name,
                {
                    'width': button_w,
                    'height': button_h,
                    'x_offset': lr_margin + 3,
                    'y_offset': 10 + head_spc + num * (button_h + button_spc),
                    'object_position': 3,
                    'line': {'color': report_conf.colour_btn_brdr, 'width': 1},
                    'fill': {'color': report_conf.colour_btn_fill},
                    'align': {'horizontal': 'center', 'vertical': 'middle'},
                    'font': {'color': report_conf.colour_btn_text,
                             'size': 12, 'bold': True},
                    'tip': 'press alt-left to return to the last visited sheet',
                    'url': f"internal:'{ws.name}'!A2"
                }
            )

    def close_and_publish(self) -> None:
        """
        **Call finalisation code, close file and publish it.**

        Publishing means copying the report to the configured publish
        directory (ref. `bd_www.Config`). In case this directory does not
        exist or is unreachable, an error will be logged but processing
        continues.
        """
        publ_dir = Path(report_conf.publ_dir)
        rep_file: Path = self.filename
        publ_file = publ_dir / rep_file.name
        super().close()
        try:
            shutil.copy(rep_file, publ_file)
        except FileNotFoundError:
            logger.error('Publishing the report was not successful')
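
A minimal usage sketch (hypothetical file name and data; it assumes that
'Report' and 'Paths' sheet specifications exist in the report configuration
file):

    from pathlib import Path

    wb = ReportWorkbook(
        Path('example_report.xlsx'),
        [('Report type', 'weekly'), ('Period', '2024-03-04 / 2024-03-10')])
    wb.add_datasheet('Paths', [(1, '/en/home'), (2, '/en/contact')])
    wb.add_buttons()
    wb.close_and_publish()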

def site_report(ts: str,
                report_set: str = 'std',
                full_info: bool = True,
                publish: bool = False,
                log_to_file: bool = True) -> None:
    """
    **Generate site reports.**

    Arguments:

        ts: timestamp of the scrape [yymmdd-hhmm] that defines the site status
        report_set: defines the types of reports to be generated ['min',
            'std' or 'max']
        full_info: add information (sheets) for all pages, links, redirs and
            paths
        publish: copy generated reports to the configured publish location
        log_to_file: log messages will be saved to file as well as to console

    Reports are generated for the site as it was during the scrape with
    timestamp `ts`. Each report is referenced to a scrape at a different
    logical (calendar) period earlier. Potential reports that will be
    generated are:

    - day: report covering one day without reference to another scrape
    - daily: report with reference to a scrape one day before
    - weekly: report with reference to a scrape one week before
    - one-monthly: report with reference to a scrape one month before
    - monthly: one-monthly report starting on the first day of the month
    - three-monthly: report starting on the first day of the month with
      reference to a scrape three months before
    - quarterly: three-monthly report starting on the first day of January,
      April, July or October
    - six-monthly: report starting on the first day of the month with reference
      to a scrape six months before
    - half-yearly: six-monthly report starting on the first day of January
      or July
    - twelve-monthly: report starting on the first day of the month with
      reference to a scrape twelve months before
    - yearly: twelve-monthly report starting on January 1st

    The actual value of the `report_set` parameter limits the reports that
    will be generated:

    - 'min': weekly on Monday, (one-)monthly on the first day of the month
    - 'std': all dailies, weekly on Monday, all monthlies on the first day
      of the month, potentially resulting also in quarterly, half-yearly
      and/or yearly reports
    - 'max': all possible (use with care!)
    - one value from the available report types ('daily', 'weekly', etc.)

    Only those reports will be generated for which the proper reference
    scrape is available. If no earlier scrape exists for any of the logical
    calendar periods, reporting falls back to a day report only (i.e.
    without reference to another scrape), but only if no specific report
    type was requested.

    **Note:**
    *The actual workhorse to generate the necessary reports is the
    `report_period` function.*
    """

    # Setup logging
    global logger
    logger = logging.getLogger('report')
    logger.setLevel(logging.INFO)
    log_file = str(mst_dir / report_conf.log_name)
    formatter = logging.Formatter(
        fmt='[%(asctime)s] %(levelname)-8s - %(message)s',
        datefmt='%Y-%m-%d %H:%M:%S')
    ch = logging.StreamHandler()
    ch.setLevel(logging.INFO)
    ch.setFormatter(formatter)
    logger.addHandler(ch)
    if log_to_file:
        fh = logging.FileHandler(log_file)
        fh.setLevel(logging.INFO)
        fh.setFormatter(formatter)
        logger.addHandler(fh)

    def months_back(date: dt.date, months: int) -> dt.date:
        """
        **Shift a date some months back.**

        Arguments:

            date: reference date
            months: number of months to shift

        Returns:

            reference date minus the number of months
        """
        eom = date
        for i in range(months):
            eom = eom.replace(day=1) - one_day
        if eom.day >= date.day:
            return eom.replace(day=date.day)
        else:
            return eom

    valid_types = set(REPORT_SUBDIRS.keys())
    valid_types.discard('day')

    # Set reporting dates for all potential reports
    report = {}
    one_day = dt.timedelta(days=1)
    scr_date = ts_to_d(ts)
    per_end = scr_date - one_day
    #
    per_start = per_end
    ref_end = ref_start = per_start - one_day
    report['daily'] = [per_start, per_end, ref_start, ref_end]
    #
    per_start = scr_date - dt.timedelta(days=7)
    ref_end = per_start - one_day
    ref_start = per_start - dt.timedelta(days=7)
    report['weekly'] = [per_start, per_end, ref_start, ref_end]
    #
    if scr_date.day == 1:
        per_start = (scr_date - dt.timedelta(days=28)).replace(day=1)
        ref_end = per_start - one_day
        ref_start = (per_start - dt.timedelta(days=28)).replace(day=1)
        report['monthly'] = [per_start, per_end, ref_start, ref_end]
    else:
        per_start = months_back(scr_date, 1)
        ref_start = months_back(scr_date, 2)
        ref_end = per_start - one_day
        report['one-monthly'] = [per_start, per_end, ref_start, ref_end]
    #
    if scr_date.day == 1 and scr_date.month % 3 == 1:
        per_start = (scr_date - dt.timedelta(days=90)).replace(day=1)
        ref_end = per_start - one_day
        ref_start = (per_start - dt.timedelta(days=90)).replace(day=1)
        report['quarterly'] = [per_start, per_end, ref_start, ref_end]
    else:
        per_start = months_back(scr_date, 3)
        ref_start = months_back(scr_date, 6)
        ref_end = per_start - one_day
        report['three-monthly'] = [per_start, per_end, ref_start, ref_end]
    #
    if scr_date.day == 1 and scr_date.month % 6 == 1:
        per_start = (scr_date - dt.timedelta(days=181)).replace(day=1)
        ref_end = per_start - one_day
        ref_start = (per_start - dt.timedelta(days=181)).replace(day=1)
        report['half-yearly'] = [per_start, per_end, ref_start, ref_end]
    else:
        per_start = months_back(scr_date, 6)
        ref_start = months_back(scr_date, 12)
        ref_end = per_start - one_day
        report['six-monthly'] = [per_start, per_end, ref_start, ref_end]
    #
    if scr_date.day == 1 and scr_date.month == 1:
        per_start = (scr_date - dt.timedelta(days=365)).replace(day=1)
        ref_end = per_start - one_day
        ref_start = (per_start - dt.timedelta(days=365)).replace(day=1)
        report['yearly'] = [per_start, per_end, ref_start, ref_end]
    else:
        per_start = months_back(scr_date, 12)
        ref_start = months_back(scr_date, 24)
        ref_end = per_start - one_day
        report['twelve-monthly'] = [per_start, per_end, ref_start, ref_end]

    # Drop potential reports that are not requested
    for per_type in list(report.keys()):
        per_start, per_end, ref_start, ref_end = report[per_type]
        match report_set:
            case 'min':
                match per_type:
                    case 'weekly':
                        if per_start.isoweekday() != 1:
                            del report[per_type]
                        continue
                    case 'monthly':
                        if per_start.day != 1:
                            del report[per_type]
                        continue
                    case _:
                        del report[per_type]
                        continue
            case 'std':
                match per_type:
                    case 'weekly':
                        if per_start.isoweekday() != 1:
                            del report[per_type]
                        continue
                    case 'daily':
                        continue
                    case _:
                        if per_start.day != 1:
                            del report[per_type]
                        continue
            case 'max':
                continue
            case _:
                if report_set != per_type:
                    del report[per_type]
                continue

    # Form arguments for reports that are covered by available scrapes
    report_args = []
    for per_type in report.keys():
        per_start, per_end, ref_start, ref_end = report[per_type]
        if qry_result := mst_conn.execute(
                'SELECT timestamp FROM mst_scrapes WHERE date = ?',
                [per_start]).fetchone():
            ref_ts = qry_result[0]
            prev_ref_ts = mst_conn.execute(
                'SELECT min(timestamp) FROM mst_scrapes WHERE date >= ?',
                [ref_start]).fetchone()[0]
            if ref_start != ts_to_d(prev_ref_ts):
                # Invalid reference period
                prev_ref_ts = ref_ts
            report_args.append([ts, ref_ts, prev_ref_ts, per_type])

    # Handle case where no potential report remains
    if not report_args:
        if report_set in valid_types:
            msg = f'no {report_set} report produced for scrape {ts}'
            logger.info(msg)
            print(msg)
            return
        else:
            msg = f'no reference scrape available for scrape {ts}: ' \
                  f'only a day report will be produced'
            logger.info(msg)
            print(msg)
            report_args = [(ts, ts, None, 'day')]

    if report_set not in {'min', 'std', 'max'} | valid_types:
        # Report set is not valid; log a warning and skip report creation
        logger.warning(f'illegal report set specified ({report_set}): '
                       f'no report created for scrape {ts}')
    else:
        # Create the reports
        logger.info(f'reporting for scrape {ts} started')
        for args in report_args:
            for report in report_conf.reports:
                report_conf.spec_report(report)
                report_period(*args, full_info=full_info, publish=publish)
        logger.info(f'reporting for scrape {ts} finished')
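
A typical invocation (hypothetical timestamp):

    site_report('240301-0400', report_set='std', publish=True)

For the scrape of 1 March 2024 this would produce the daily report and,
since the reporting periods start on the first day of a month, the monthly
and longer-period reports for which reference scrapes exist; no weekly
report results, because the weekly period does not start on a Monday.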

Generate site reports.

Arguments:

ts: timestamp of the scrape [yymmdd-hhmm] that defines the site status
report_set: defines the types of reports to be generated ['min',
    'std' or 'max']
full_info: add information (sheets) for all pages, links, redirs and
    paths
publish: copy generated reports to the configured publish location
log_to_file: log messages will be saved to file as well as to console

Reports are generated for the site as it was during the scrape with timestamp ts. Each report is referenced to a scrape at a different logical (calendar) period earlier. Potential reports that will be generated are:

  • day: report covering one day without reference to another scrape
  • daily: report with reference to a scrape one day before
  • weekly: report with reference to a scrape one week before
  • one-monthly: report with reference to a scrape one month before
  • monthly: one-monthly report starting on the first day of the month
  • three-monthly: report starting on the first day of the month with reference to a scrape three months before
  • quarterly: three-monthly report starting on the first day of January, April, July or October
  • six-monthly: report starting on the first day of the month with reference to a scrape six months before
  • half-yearly: six-monthly report starting on the first day of January or July
  • twelve-monthly: report starting on the first day of the month with reference to a scrape twelve months before
  • yearly: twelve-monthly report starting on January 1st

The actual value of the report_set parameter limits the reports that will be generated:

  • 'min': weekly on monday, (one-)monthly on first-day-of-month
  • 'std': all daily's, weekly on monday, all monthly's on first-day-of-month, potentially resulting also in quarterly, half-yearly and/or yearly
  • 'max': all possible (use with care!)
  • one value from the available report types ('daily', 'weekly', etc.)

Only those reports will be generated for which the right reference scrape is available. If no earlier scrape exists for any of the logical calendar periods before, reporting falls back to a day report only (so without reference to any other scrape), but only if no specific type was requested.

Note: The actual workhorse to generate the necessary reports is the report_period function.
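
As an illustration, a minimal invocation could look as follows (a sketch with a hypothetical timestamp; the keyword arguments correspond to the parameters listed above):

>>> from bd_www.report import site_report
>>> site_report('240101-0400', report_set='std', full_info=True, publish=False)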

#   def ts_to_d(ts: str, delta_days: int = 0) -> datetime.date:
View Source
def ts_to_d(ts: str, delta_days: int = 0) -> dt.date:
    """
    **Convert timestamp to date with potential day shift.**

    Arguments:

        ts: timestamp with format yymmdd-hhmm
        delta_days: number of days to add

    Returns:

        the timestamp converted to a date

    The returned date is shifted by `delta_days` days.

    Validity of the timestamp is not checked.
    """

    # Parse the date components, falling back to defaults (year 2020,
    # month 1, day 1) when the timestamp is malformed
    y = ts[0:2]
    m = ts[2:4]
    d = ts[4:6]
    year = int(y) if y.isdecimal() else 20
    month = int(m) if m.isdecimal() else 1
    day = int(d) if d.isdecimal() else 1
    return dt.date(2000 + year, month, day) + dt.timedelta(days=delta_days)

Convert timestamp to date with potential day shift.

Arguments:

ts: timestamp with format yymmdd-hhmm
delta_days: number of days to add

Returns:

the timestamp converted to a date

The returned date is shifted by delta_days days.

Validity of the timestamp is not checked.
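
For example, shifting back one day correctly crosses a month boundary (2024 being a leap year):

>>> ts_to_d('240301-0400', -1)
datetime.date(2024, 2, 29)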

#   def report_period( rep_ts: str, ref_ts: str, pre_ts: str = None, rep_type: str = 'day', full_info: bool = True, publish: bool = False ) -> None:
View Source
def report_period(rep_ts: str,
                  ref_ts: str,
                  pre_ts: str = None,
                  rep_type: str = 'day',
                  full_info: bool = True,
                  publish: bool = False) -> None:
    """
    **Write an Excel report for a given period.**

    Arguments:

        rep_ts: timestamp of the scrape that marks the (exclusive) end of the
            reporting period
        ref_ts: timestamp of the scrape that marks the (inclusive) start of the
            reporting period and the (exclusive) end of the reference period
        pre_ts: timestamp of the scrape that marks the (inclusive) start of the
            reference period
        rep_type: textual characterisation of the reporting period to be
            part of the report name
        full_info: add sheets with pages, links, downloads, redirs and paths
        publish: copy generated report to the configured publish location

    The report will be written as an Excel workbook to the subdirectory
    according to the `REPORT_SUBDIRS` constant within the configured report
    directory. A report with the same name will be overwritten.

    Apart from the sources of the pages, most of the content of the scrape
    at the reporting timestamp `rep_ts` is included in the report.
    Additionally, the report contains page metrics and page feedback for the
    period, and the essential differences for the site as a whole and for
    each new, removed and changed page relative to the scrape at the
    reference timestamp `ref_ts`.

    In case the reference timestamp `ref_ts` equals the reporting timestamp
    `rep_ts`, a 'day' report will be generated, which will not contain
    reference to another scrape.

    The pre-reference timestamp `pre_ts` marks the start of the reference
    period, which is used to compare some period dependent (*dynamic*) key
    figures with the previous period. In case `pre_ts` is `None` (default) or
    equal to the reference timestamp `ref_ts` (leading to a reference period
    of one day), key figures that are dependent on the period length will not
    be referenced to another period.
    """

    # Set (inclusive) dates for the reporting period
    rep_end = ts_to_d(rep_ts, -1)
    rep_start = ts_to_d(ref_ts) if ref_ts != rep_ts else rep_end

    # Initiate the report workbook
    rep_name = report_conf.report_name
    report_dir = mst_dir / report_conf.report_dir_name
    xlsx_name = f'{rep_ts} - {rep_type} {rep_name}.xlsx'
    xlsx_path = report_dir / REPORT_SUBDIRS[rep_type] / xlsx_name
    if ref_ts != rep_ts:
        per_str = rep_start.strftime('%b %d, %Y') + ' / '
    else:
        per_str = 'single day: '
    report_info: list[tuple] = [
        ('Report period', per_str + rep_end.strftime('%b %d, %Y')),
        ('Timestamp status scrape', rep_ts),
        ('Timestamp reference scrape', ref_ts if ref_ts != rep_ts else 'None'),
        ('Report creation', time.strftime('%b %d, %Y %H:%M')),
        ('Report version', REPORT_VERSION),
        ('Root URL', ROOT_URL)
    ]
    wb = ReportWorkbook(xlsx_path, report_info)
    keyfigs_sheet(wb, rep_ts, ref_ts, pre_ts)
    data_legend_sheet(wb)

    # Add sheets with reference data
    if ref_ts != rep_ts:
        rem_pages_sheet(wb, rep_ts, ref_ts, report_conf.incl_fb)
        new_pages_sheet(wb, rep_ts, ref_ts, report_conf.incl_fb)
        changed_aspects_sheet(wb, rep_ts, ref_ts)

    # Add sheets with scrape details
    if full_info:
        all_pages_sheet(wb, rep_ts, ref_ts, report_conf.incl_fb)
        if report_conf.incl_fb:
            all_feedback_sheet(wb, rep_ts, ref_ts)
        ed_links_sheet(wb, rep_ts)
        downloads_sheet(wb, rep_ts, ref_ts)
        redirs_aliases_sheet(wb, rep_ts)
        paths_sheet(wb, rep_ts, ref_ts)

    wb.add_buttons()
    if publish:
        wb.close_and_publish()
        logger.info(f'report generated and published: {xlsx_name}')
    else:
        wb.close()
        logger.info(f'report generated: {xlsx_name}')

Write an Excel report for a given period.

Arguments:

rep_ts: timestamp of the scrape that marks the (exclusive) end of the
    reporting period
ref_ts: timestamp of the scrape that marks the (inclusive) start of the
    reporting period and the (exclusive) end of the reference period
pre_ts: timestamp of the scrape that marks the (inclusive) start of the
    reference period
rep_type: textual characterisation of the reporting period to be
    part of the report name
full_info: add sheets with pages, links, downloads, redirs and paths
publish: copy generated report to the configured publish location

The report will be written as an Excel workbook to the subdirectory according to the REPORT_SUBDIRS constant within the configured report directory. A report with the same name will be overwritten.

Apart from the sources of the pages, most of the content of the scrape at the reporting timestamp rep_ts is included in the report. Additionally, the report contains page metrics and page feedback for the period, and the essential differences for the site as a whole and for each new, removed and changed page relative to the scrape at the reference timestamp ref_ts.

In case the reference timestamp ref_ts equals the reporting timestamp rep_ts, a 'day' report will be generated, which will not contain reference to another scrape.

The pre-reference timestamp pre_ts marks the start of the reference period, which is used to compare some period dependent (dynamic) key figures with the previous period. In case pre_ts is None (default) or equal to the reference timestamp ref_ts (leading to a reference period of one day), key figures that are dependent on the period length will not be referenced to another period.
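
For example, a monthly report covering March 2024, referenced against February (all timestamps hypothetical, each naming a scrape taken on the first of the month):

>>> report_period('240401-0400', '240301-0400', '240201-0400', 'monthly')

The reporting period then runs from March 1 through March 31 inclusive, since the end date is one day before the reporting scrape.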

#   def keyfigs_sheet( wb: bd_www.report.ReportWorkbook, rep_ts: str, ref_ts: str, pre_ts: str ) -> None:
View Source
def keyfigs_sheet(wb: ReportWorkbook,
                  rep_ts: str, ref_ts: str, pre_ts: str) -> None:
    """
    **Add a report sheet with key figures.**

    Arguments:

        wb: workbook in which the new worksheet is created
        rep_ts: timestamp of the scrape that marks the (exclusive) end of the
            reporting period
        ref_ts: timestamp of the scrape that marks the (inclusive) start of the
            reporting period and the (exclusive) end of the reference period
        pre_ts: timestamp of the scrape that marks the (inclusive) start of the
            reference period

    The sheet will contain two types of key figures, which can only implicitly
    be distinguished from each other:

    - figures describing the site *status* at the end of the reporting period
    - figures representing the *dynamics* of the site over the reporting period

    The first (*static*) type is readily available for all timestamps in the
    *his_status_figures* table of the scrape master database. The second
    (*dynamic*) type, consisting of key figures for metrics, feedback and
    editorial text modifications, is calculated by this function.

    In case the reference start timestamp `pre_ts` is `None` or equal to the
    reference end timestamp `ref_ts`, *dynamic* key figures will not be
    referenced.
    """

    # Set (inclusive) dates for the reporting and reference period
    rep_end = ts_to_d(rep_ts, -1)
    rep_start = ts_to_d(ref_ts) if ref_ts != rep_ts else rep_end

    # Gather the key figures to report:
    # - get the dynamic ones
    rep_kfigs = metric_keyfigs(rep_ts, rep_start, rep_end)
    rep_kfigs |= feedback_keyfigs(rep_ts, rep_start, rep_end)
    rep_kfigs |= modpages_keyfigs(rep_ts, ref_ts)
    rep_kfigs |= rempages_keyfigs(rep_ts, ref_ts)
    rep_kfigs |= newpages_keyfigs(rep_ts, ref_ts)
    # - store their names for later use
    rep_dyn_kf_names = list(rep_kfigs)
    # - add the static ones
    rep_kfigs |= status_keyfigs(rep_ts)

    # Prepare the data for the sheet
    kf_details = keyfig_details()
    sheet_values = []
    if ref_ts != rep_ts:
        # Add reference values to compare the report values with
        ref_kfigs = status_keyfigs(ref_ts)
        if pre_ts and pre_ts != ref_ts:
            # Get the reference values for the dynamic key figures
            ref_end = ts_to_d(ref_ts, -1)
            ref_start = ts_to_d(pre_ts)
            ref_kfigs |= metric_keyfigs(ref_ts, ref_start, ref_end)
            ref_kfigs |= feedback_keyfigs(ref_ts, ref_start, ref_end)
            ref_kfigs |= rempages_keyfigs(ref_ts, pre_ts)
            ref_kfigs |= newpages_keyfigs(ref_ts, pre_ts)
            ref_kfigs |= modpages_keyfigs(ref_ts, pre_ts)
        else:
            # The reference period is not known, so add 'x' references for
            # the dynamic key figures.
            for kf_name in rep_dyn_kf_names:
                # noinspection PyTypeChecker
                ref_kfigs[kf_name] = 'x'
        # Calculate delta between reporting and reference values and add key
        # figure details.
        for kf_name in set(rep_kfigs.keys()) | set(ref_kfigs.keys()):
            seq_nr, cluster, description = kf_details.get(kf_name, [0, '', ''])
            rep_val = rep_kfigs.get(kf_name, 0)
            if rep_val == 'x':
                rep_val = ''
                delta = ''
            else:
                ref_val = ref_kfigs.get(kf_name, 0)
                delta = '' if ref_val == 'x' else rep_val - ref_val
            sheet_values.append(
                [seq_nr, cluster, description, kf_name, rep_val, delta]
            )
    else:
        # Add key figure details
        for kf_name in rep_kfigs.keys():
            seq_nr, cluster, description = kf_details.get(kf_name, [0, '', ''])
            rep_val = rep_kfigs.get(kf_name, 0)
            if rep_val == 'x':
                rep_val = ''
            sheet_values.append(
                [seq_nr, cluster, description, kf_name, rep_val]
            )

    # Sort on sequence number and remove this number from the data
    sheet_values.sort()
    for row in sheet_values:
        del row[0]

    # Write the data
    wb.add_datasheet('Key figures', sheet_values,
                     shading='cluster', cluster_del=True)

Add a report sheet with key figures.

Arguments:

wb: workbook in which the new worksheet is created
rep_ts: timestamp of the scrape that marks the (exclusive) end of the
    reporting period
ref_ts: timestamp of the scrape that marks the (inclusive) start of the
    reporting period and the (exclusive) end of the reference period
pre_ts: timestamp of the scrape that marks the (inclusive) start of the
    reference period

The sheet will contain two types of key figures, which can only implicitly be distinguished from each other:

  • figures describing the site status at the end of the reporting period
  • figures representing the dynamics of the site over the reporting period

The first (static) type is readily available for all timestamps in the his_status_figures table of the scrape master database. The second (dynamic) type, consisting of key figures for metrics, feedback and editorial text modifications, is calculated by this function.

In case the reference start timestamp pre_ts is None or equal to the reference end timestamp ref_ts, dynamic key figures will not be referenced.

#   def status_keyfigs(ts: str) -> dict[str, int]:
View Source
def status_keyfigs(ts: str) -> dict[str, int]:
    """
    **Get the status figures of a scrape.**

    Arguments:

        ts: timestamp of the scrape [yymmdd-hhmm]

    Returns:

        name, value dictionary of the key figures

    Just returns the name/value pairs from the *his_status_figures* table
    with timestamp `ts`.
    """

    qry = f'''
        SELECT name, value
        FROM his_status_figures
        WHERE timestamp = '{ts}' '''
    return {name: value for name, value in mst_conn.execute(qry).fetchall()}

Get the status figures of a scrape.

Arguments:

ts: timestamp of the scrape [yymmdd-hhmm]

Returns:

name, value dictionary of the key figures

Just returns the name/value pairs from the his_status_figures table with timestamp ts.
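
The timestamp is interpolated directly into the SQL text. A functionally equivalent parameterized sketch of the same lookup (assuming mst_conn is the module's sqlite3 connection):

qry = 'SELECT name, value FROM his_status_figures WHERE timestamp = ?'
figs = dict(mst_conn.execute(qry, (ts,)).fetchall())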

#   def metric_keyfigs( ts: str, first_date: datetime.date, last_date: datetime.date ) -> dict[str, int]:
View Source
def metric_keyfigs(
        ts: str, first_date: dt.date, last_date: dt.date) -> dict[str, int]:
    """
    **Get metrics key figures for the period.**

    Arguments:

        ts: timestamp of the scrape [yymmdd-hhmm] that defines the (exclusive)
            end of the period
        first_date: first day of the period
        last_date: last day of the period

    Returns:

        name, value dictionary of the key figures

    The following key figures are returned:

    - views_total: pageviews of all pages
    - pages_visits_low: number of pages with less than 100 visits
    - pages_visits_xlow: number of pages with less than 10 visits
    - pages_visits_no: number of pages with no visits
    - views_lang_`<language>`: pageviews per *language*
    - views_buss_`<business>`: pageviews per *business*
    - views_cat_`<category>`: pageviews per *category*

    In case the `dates` table of the metrics master database does not cover
    all days of the period, the key figures will be returned with a (textual)
    value of 'x'.
    """

    per_days = (last_date - first_date).days + 1
    if per_days < 1:
        raise ValueError('invalid period length')
    per_metrics(first_date, last_date)
    qry = f'''
        WITH
            -- All days in period
            days AS (
                SELECT date_id
                FROM dates
                WHERE date >= '{first_date}' AND date <= '{last_date}'
            ),
            -- Check if metrics are available for all days in period 
            per_chk AS (
                SELECT count(*) = {per_days} AS ok
                FROM days
            ),
            -- Last known language of all pages until timestamp
            page_lang AS (
                SELECT DISTINCT
                    page_id,
                    last_value(language) OVER (
                        PARTITION BY page_id
                        ORDER BY timestamp
                        ROWS BETWEEN UNBOUNDED PRECEDING
                            AND UNBOUNDED FOLLOWING
                    ) AS language
                FROM his_pages_info
                WHERE timestamp <= '{ts}'            
            ),
            -- Last known business of all pages until timestamp
            page_buss AS (
                SELECT DISTINCT
                    page_id,
                    last_value(business) OVER (
                        PARTITION BY page_id
                        ORDER BY timestamp
                        ROWS BETWEEN UNBOUNDED PRECEDING
                            AND UNBOUNDED FOLLOWING
                    ) AS business
                FROM his_pages_info
                WHERE timestamp <= '{ts}'
            ),
            -- Last known category of all pages until timestamp
            page_cat AS (
                SELECT DISTINCT
                    page_id,
                    last_value(category) OVER (
                        PARTITION BY page_id
                        ORDER BY timestamp
                        ROWS BETWEEN UNBOUNDED PRECEDING
                            AND UNBOUNDED FOLLOWING
                    ) AS category
                FROM his_pages_info
                WHERE timestamp <= '{ts}'
            ),
            metric_keyfigs (name, value) AS (
                SELECT 'views_total', iif(per_chk.ok, sum(views), 'x')
                FROM per_metrics, per_chk
                    UNION
                SELECT 'pages_visits_low', iif(per_chk.ok, count(*), 'x')
                FROM per_metrics, per_chk
                WHERE visits < 100
                    UNION
                SELECT 'pages_visits_xlow', iif(per_chk.ok, count(*), 'x')
                FROM per_metrics, per_chk
                WHERE visits < 10
                    UNION
                SELECT 'pages_visits_no', iif(per_chk.ok, count(*), 'x')
                FROM per_metrics, per_chk
                WHERE visits = 0
                    UNION
                SELECT
                    'views_lang_' || ifnull(language, 'None'),
                    iif(per_chk.ok, sum(views), 'x')
                FROM per_metrics, per_chk
                LEFT JOIN page_lang USING (page_id)
                GROUP BY language
                    UNION
                SELECT
                    'views_buss_' || ifnull(business, 'None'),
                    iif(per_chk.ok, sum(views), 'x')
                FROM per_metrics, per_chk
                LEFT JOIN page_buss USING (page_id)
                GROUP BY business
                    UNION
                SELECT
                    'views_cat_' || ifnull(category, 'None'),
                    iif(per_chk.ok, sum(views), 'x')
                FROM per_metrics, per_chk
                LEFT JOIN page_cat USING (page_id)
                GROUP BY category
            )
        SELECT name, value
        FROM metric_keyfigs'''
    return {name: value for name, value in mst_conn.execute(qry).fetchall()}

Get metrics key figures for the period.

Arguments:

ts: timestamp of the scrape [yymmdd-hhmm] that defines the (exclusive)
    end of the period
first_date: first day of the period
last_date: last day of the period

Returns:

name, value dictionary of the key figures

The following key figures are returned:

  • views_total: pageviews of all pages
  • pages_visits_low: number of pages with less than 100 visits
  • pages_visits_xlow: number of pages with less than 10 visits
  • pages_visits_no: number of pages with no visits
  • views_lang_<language>: pageviews per language
  • views_buss_<business>: pageviews per business
  • views_cat_<category>: pageviews per category

In case the dates table of the metrics master database does not cover all days of the period, the key figures will be returned with a (textual) value of 'x'.
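
For illustration, a returned dictionary could look like this (all names beyond the first four depend on the languages, businesses and categories present in the data; the values are hypothetical):

{'views_total': 1203456,
 'pages_visits_low': 412,
 'pages_visits_xlow': 87,
 'pages_visits_no': 12,
 'views_lang_nl': 1150000,
 'views_lang_en': 53456,
 'views_buss_None': 9876}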

#   def feedback_keyfigs( ts: str, first_date: datetime.date, last_date: datetime.date ) -> dict[str, int]:
View Source
def feedback_keyfigs(
        ts: str, first_date: dt.date, last_date: dt.date) -> dict[str, int]:
    """
    **Get feedback key figures for the period.**

    Arguments:

        ts: timestamp of the scrape [yymmdd-hhmm] that defines the (exclusive)
            end of the period
        first_date: first day of the period
        last_date: last day of the period

    Returns:

        name, value dictionary of the key figures

    The following positive and negative feedback key figures are returned:

    - feedback_pos_total
    - feedback_neg_total
    - feedback_sat-rate_per
    - feedback_sat-rate_ytd
    - feedback_<pos/neg>_lang_`<language>` per *language*
    - feedback_<pos/neg>_buss_`<business>` per *business*
    - feedback_<pos/neg>_cat_`<category>` per *category*

    In case the `dates` table of the metrics master database does not cover
    all days of the period, the key figures will be returned with a (textual)
    value of 'x'.
    """
    per_days = (last_date - first_date).days + 1
    if per_days < 1:
        raise ValueError('invalid period length')
    per_pages(first_date, last_date)
    qry = f'''
        WITH
            -- All days in period
            per_days AS (
                SELECT date_id
                FROM dates
                WHERE date >= '{first_date}' AND date <= '{last_date}'
            ),
            -- All days year-to-date
            ytd_days AS (
                SELECT date_id
                FROM dates
                WHERE date >= substr('{last_date}', 1, 4) || '-01-01'
                    AND date <= '{last_date}'
            ),
            -- Check if metrics are available for all days in period 
            per_chk AS (
                SELECT count(*) =
                        julianday('{last_date}') 
                        - julianday('{first_date}') + 1
                    AS ok
                FROM per_days
            ),
            -- Check if metrics are available for all days year-to-date 
            ytd_chk AS (
                SELECT count(*) = 
                        julianday('{last_date}') 
                        - julianday(substr('{last_date}', 1, 4) || '-01-01') + 1
                    AS ok
                FROM ytd_days
            ),
            -- Period totals per page
            per_feedback AS (
                SELECT
                    page_id,
                    sum(pos_cnt) AS pos_cnt,
                    sum(neg_cnt) AS neg_cnt,
                    group_concat(pos_txt, char(10)) AS pos_txt,
                    group_concat(neg_txt, char(10)) AS neg_txt
                FROM feedback
                WHERE date_id IN per_days
                    -- Exclude pages that were not detected to be alive in the
                    -- period (which happened with earlier scrapes):
                    AND page_id IN per_pages
                GROUP BY page_id
            ),
            -- Year-to-date totals
            ytd_fb_tot AS (
                SELECT
                    sum(pos_cnt) AS pos_cnt,
                    sum(neg_cnt) AS neg_cnt
                FROM feedback
                WHERE date_id IN ytd_days
            ),
            -- Last known language of all pages until timestamp
            page_lang AS (
                SELECT DISTINCT
                    page_id,
                    last_value(language) OVER (
                        PARTITION BY page_id
                        ORDER BY timestamp
                        ROWS BETWEEN UNBOUNDED PRECEDING
                            AND UNBOUNDED FOLLOWING
                    ) AS language
                FROM his_pages_info
                WHERE timestamp <= '{ts}'
            ),
            -- Last known business of all pages until timestamp
            page_buss AS (
                SELECT DISTINCT
                    page_id,
                    last_value(business) OVER (
                        PARTITION BY page_id
                        ORDER BY timestamp
                        ROWS BETWEEN UNBOUNDED PRECEDING
                            AND UNBOUNDED FOLLOWING
                    ) AS business
                FROM his_pages_info
                WHERE timestamp <= '{ts}'
            ),
            -- Last known category of all pages until timestamp
            page_cat AS (
                SELECT DISTINCT
                    page_id,
                    last_value(category) OVER (
                        PARTITION BY page_id
                        ORDER BY timestamp
                        ROWS BETWEEN UNBOUNDED PRECEDING
                            AND UNBOUNDED FOLLOWING
                    ) AS category
                FROM his_pages_info
                WHERE timestamp <= '{ts}'
            ),
            feedback_keyfigs (name, value) AS (
                SELECT
                    'feedback_pos_total',
                    iif(per_chk.ok, sum(pos_cnt), 'x')
                FROM per_feedback, per_chk
                    UNION
                SELECT
                    'feedback_pos_lang_' || ifnull(language, 'None'),
                    iif(per_chk.ok, sum(pos_cnt), 'x')
                FROM per_feedback, per_chk
                LEFT JOIN page_lang USING (page_id)
                GROUP BY language
                    UNION
                SELECT
                    'feedback_pos_buss_' || ifnull(business, 'None'),
                    iif(per_chk.ok, sum(pos_cnt), 'x')
                FROM per_feedback, per_chk
                LEFT JOIN page_buss USING (page_id)
                GROUP BY business
                    UNION
                SELECT
                    'feedback_pos_cat_' || ifnull(category, 'None'),
                    iif(per_chk.ok, sum(pos_cnt), 'x')
                FROM per_feedback, per_chk
                LEFT JOIN page_cat USING (page_id)
                GROUP BY category
                    UNION
                SELECT
                    'feedback_neg_total',
                    iif(per_chk.ok, sum(neg_cnt), 'x')
                FROM per_feedback, per_chk
                    UNION
                SELECT
                    'feedback_neg_lang_' || ifnull(language, 'None'),
                    iif(per_chk.ok, sum(neg_cnt), 'x')
                FROM per_feedback, per_chk
                LEFT JOIN page_lang USING (page_id)
                GROUP BY language
                    UNION
                SELECT
                    'feedback_neg_buss_' || ifnull(business, 'None'),
                    iif(per_chk.ok, sum(neg_cnt), 'x')
                FROM per_feedback, per_chk
                LEFT JOIN page_buss USING (page_id)
                GROUP BY business
                    UNION
                SELECT
                    'feedback_neg_cat_' || ifnull(category, 'None'),
                    iif(per_chk.ok, sum(neg_cnt), 'x')
                FROM per_feedback, per_chk
                LEFT JOIN page_cat USING (page_id)
                GROUP BY category
                    UNION
                SELECT
                    'feedback_sat-rate_per',
                    iif(per_chk.ok,
                        CAST(sum(pos_cnt) AS REAL) 
                            / (sum(pos_cnt) + sum(neg_cnt)), 
                        'x')
                FROM per_feedback, per_chk
                    UNION
                SELECT
                    'feedback_sat-rate_ytd',
                    iif(ytd_chk.ok,
                        CAST(pos_cnt AS REAL) / (pos_cnt + neg_cnt), 
                        'x')
                FROM ytd_fb_tot, ytd_chk
            )
        SELECT name, value
        FROM feedback_keyfigs
        '''
    return {name: value for name, value in mst_conn.execute(qry).fetchall()}

Get feedback key figures for the period.

Arguments:

ts: timestamp of the scrape [yymmdd-hhmm] that defines the (exclusive)
    end of the period
first_date: first day of the period
last_date: last day of the period

Returns:

name, value dictionary of the key figures

The following positive and negative feedback key figures are returned:

  • feedback_pos_total
  • feedback_neg_total
  • feedback_sat-rate_per
  • feedback_sat-rate_ytd
  • feedback_<pos/neg>_lang_<language> per language
  • feedback_<pos/neg>_buss_<business> per business
  • feedback_<pos/neg>_cat_<category> per category

In case the dates table of the metrics master database does not cover all days of the period, the key figures will be returned with a (textual) value of 'x'.
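
The two satisfaction rates are simply the positive fraction of all feedback counted over the period or year-to-date; expressed in Python:

def sat_rate(pos_cnt: int, neg_cnt: int) -> float:
    # Positive fraction of all feedback; mirrors the CAST ... / (pos + neg)
    # expression in the query above. Note that the SQL yields NULL when both
    # counts are zero, whereas this sketch would raise ZeroDivisionError.
    return pos_cnt / (pos_cnt + neg_cnt)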

#   def rempages_keyfigs(rep_ts: str, ref_ts: str) -> dict[str, int]:
View Source
def rempages_keyfigs(rep_ts: str, ref_ts: str) -> dict[str, int]:
    """
    **Get key figures for removed pages since reference scrape.**

    Arguments:

        rep_ts: timestamp of the reporting scrape [yymmdd-hhmm]
        ref_ts: timestamp of the reference scrape [yymmdd-hhmm]

    Returns:

        name, value dictionary of the key figures

    The following key figures are returned:

    - pages_rem: total number of removed pages
    - pages_lang_`<language>`_rem: number of removed pages per *language*
    - pages_buss_`<business>`_rem: number of removed pages per *business*
    - pages_cat_`<category>`_rem: number of removed pages per *category*

    When both timestamps (`rep_ts` and `ref_ts`) are the same, the key
    figures will be returned with a (textual) value of 'x'.
    """

    # Handle special cases
    if not ref_ts or not rep_ts or ref_ts > rep_ts:
        raise ValueError('invalid period')
    if ref_ts == rep_ts:
        qry = f'''
            WITH
                -- Last known different combinations of language, business and
                -- category until reporting timestamp.
                lan_bus_cat_combis AS (
                    SELECT DISTINCT
                        last_value(language) OVER win AS language,
                        last_value(business) OVER win AS business,
                        last_value(category) OVER win AS category
                    FROM his_pages_info
                    WHERE timestamp <= '{rep_ts}'
                    WINDOW win AS (
                        PARTITION BY page_id
                        ORDER BY timestamp
                        ROWS BETWEEN UNBOUNDED PRECEDING
                            AND UNBOUNDED FOLLOWING
                    )
                ),
                rempages_keyfigs (name, value) AS (
                    SELECT 'pages_rem', 'x'
                        UNION
                    SELECT
                        'pages_lang_' || ifnull(language, 'None') || '_rem',
                        'x'
                    FROM lan_bus_cat_combis
                    GROUP BY language
                        UNION
                    SELECT
                        'pages_buss_' || ifnull(business, 'None') || '_rem',
                        'x'
                    FROM lan_bus_cat_combis
                    GROUP BY business
                        UNION
                    SELECT
                        'pages_cat_' || ifnull(category, 'None') || '_rem',
                        'x'
                    FROM lan_bus_cat_combis
                    GROUP BY category
                )
            SELECT name, value
            FROM rempages_keyfigs'''
        logger.info('key figures for removed pages requested for '
                    'period with zero length')
        return {name: value for name, value in mst_conn.execute(qry).fetchall()}

    # Query the key figures for the regular situation
    qry = f'''
        WITH
            -- All timestamps within the reporting period
            period AS (
                SELECT timestamp
                FROM mst_scrapes
                WHERE timestamp > '{ref_ts}'
                    AND timestamp <= '{rep_ts}'
            ),
            -- All removed pages in period
            removed_pages AS (
                SELECT page_id
                FROM his_pages_life
                WHERE timestamp in period
                    AND NOT alive
            ),
            -- Language, business and category aspects of the removed pages
            lan_bus_cat_aspects AS (
                SELECT DISTINCT
                    page_id,
                    last_value(language) OVER win AS language,
                    last_value(business) OVER win AS business,
                    last_value(category) OVER win AS category
                FROM his_pages_info
                WHERE timestamp <= '{rep_ts}' AND page_id IN removed_pages
                WINDOW win AS (
                    PARTITION BY page_id
                    ORDER BY timestamp
                    ROWS BETWEEN UNBOUNDED PRECEDING
                        AND UNBOUNDED FOLLOWING
                )
            ),
            rempages_keyfigs (name, value) AS (
                SELECT 'pages_rem', count(*)
                FROM removed_pages
                    UNION
                SELECT
                    'pages_lang_' || ifnull(language, 'None') || '_rem',
                    count(*)
                FROM lan_bus_cat_aspects
                GROUP BY language
                    UNION
                SELECT
                    'pages_buss_' || ifnull(business, 'None') || '_rem',
                    count(*)
                FROM lan_bus_cat_aspects
                GROUP BY business
                    UNION
                SELECT
                    'pages_cat_' || ifnull(category, 'None') || '_rem',
                    count(*)
                FROM lan_bus_cat_aspects
                GROUP BY category
            )
        SELECT name, value
        FROM rempages_keyfigs'''
    return {name: value for name, value in mst_conn.execute(qry).fetchall()}

Get key figures for removed pages since reference scrape.

Arguments:

rep_ts: timestamp of the reporting scrape [yymmdd-hhmm]
ref_ts: timestamp of the reference scrape [yymmdd-hhmm]

Returns:

name, value dictionary of the key figures

The following key figures are returned:

  • pages_rem: total number of removed pages
  • pages_lang_<language>_rem: number of removed pages per language
  • pages_buss_<business>_rem: number of removed pages per business
  • pages_cat_<category>_rem: number of removed pages per category

When both timestamps (rep_ts and ref_ts) are the same, the key figures will be returned with a (textual) value of 'x'.

#   def newpages_keyfigs(rep_ts: str, ref_ts: str) -> dict[str, int]:
View Source
def newpages_keyfigs(rep_ts: str, ref_ts: str) -> dict[str, int]:
    """
    **Get key figures for new pages since reference scrape.**

    Arguments:

        rep_ts: timestamp of the reporting scrape [yymmdd-hhmm]
        ref_ts: timestamp of the reference scrape [yymmdd-hhmm]

    Returns:

        name, value dictionary of the key figures

    The following key figures are returned:

    - pages_new: total number of new pages
    - pages_lang_`<language>`_new: number of new pages per *language*
    - pages_buss_`<business>`_new: number of new pages per *business*
    - pages_cat_`<category>`_new: number of new pages per *category*

    When both timestamps (`rep_ts` and `ref_ts`) are the same, the key
    figures will be returned with a (textual) value of 'x'.
    """

    # Handle special cases
    if not ref_ts or not rep_ts or ref_ts > rep_ts:
        raise ValueError('invalid period')
    if ref_ts == rep_ts:
        qry = f'''
            WITH
                -- Last known different combinations of language, business and
                -- category until reporting timestamp.
                lan_bus_cat_combis AS (
                    SELECT DISTINCT
                        last_value(language) OVER win AS language,
                        last_value(business) OVER win AS business,
                        last_value(category) OVER win AS category
                    FROM his_pages_info
                    WHERE timestamp <= '{rep_ts}'
                    WINDOW win AS (
                        PARTITION BY page_id
                        ORDER BY timestamp
                        ROWS BETWEEN UNBOUNDED PRECEDING
                            AND UNBOUNDED FOLLOWING
                    )
                ),
                newpages_keyfigs (name, value) AS (
                    SELECT 'pages_new', 'x'
                        UNION
                    SELECT
                        'pages_lang_' || ifnull(language, 'None') || '_new',
                        'x'
                    FROM lan_bus_cat_combis
                    GROUP BY language
                        UNION
                    SELECT
                        'pages_buss_' || ifnull(business, 'None') || '_new',
                        'x'
                    FROM lan_bus_cat_combis
                    GROUP BY business
                        UNION
                    SELECT
                        'pages_cat_' || ifnull(category, 'None') || '_new',
                        'x'
                    FROM lan_bus_cat_combis
                    GROUP BY category
                )
            SELECT name, value
            FROM newpages_keyfigs'''
        logger.info(
            'key figures for new pages requested for period with zero length')
        return {name: value for name, value in mst_conn.execute(qry).fetchall()}

    # Query the key figures for the regular situation
    qry = f'''
        WITH
            -- All timestamps within the reporting period
            period AS (
                SELECT timestamp
                FROM mst_scrapes
                WHERE timestamp > '{ref_ts}'
                    AND timestamp <= '{rep_ts}'
            ),
            -- All new pages in period
            new_pages AS (
                SELECT page_id
                FROM his_pages_life
                WHERE timestamp in period
                    AND alive
            ),
            -- Language, business and category aspects of the new pages
            lan_bus_cat_aspects AS (
                SELECT DISTINCT
                    page_id,
                    last_value(language) OVER win AS language,
                    last_value(business) OVER win AS business,
                    last_value(category) OVER win AS category
                FROM his_pages_info
                WHERE timestamp <= '{rep_ts}' AND page_id IN new_pages
                WINDOW win AS (
                    PARTITION BY page_id
                    ORDER BY timestamp
                    ROWS BETWEEN UNBOUNDED PRECEDING
                        AND UNBOUNDED FOLLOWING
                )
            ),
            newpages_keyfigs (name, value) AS (
                SELECT 'pages_new', count(*)
                FROM new_pages
                    UNION
                SELECT
                    'pages_lang_' || ifnull(language, 'None') || '_new',
                    count(*)
                FROM lan_bus_cat_aspects
                GROUP BY language
                    UNION
                SELECT
                    'pages_buss_' || ifnull(business, 'None') || '_new',
                    count(*)
                FROM lan_bus_cat_aspects
                GROUP BY business
                    UNION
                SELECT
                    'pages_cat_' || ifnull(category, 'None') || '_new',
                    count(*)
                FROM lan_bus_cat_aspects
                GROUP BY category
            )
        SELECT name, value
        FROM newpages_keyfigs'''
    return {name: value for name, value in mst_conn.execute(qry).fetchall()}

Get key figures for new pages since reference scrape.

Arguments:

rep_ts: timestamp of the reporting scrape [yymmdd-hhmm]
ref_ts: timestamp of the reference scrape [yymmdd-hhmm]

Returns:

name, value dictionary of the key figures

The following key figures are returned:

  • pages_new: total number of new pages
  • pages_lang_<language>_new: number of new pages per language
  • pages_buss_<business>_new: number of new pages per business
  • pages_cat_<category>_new: number of new pages per category

When both timestamps (rep_ts and ref_ts) are the same, the key figures will be returned with a (textual) value of 'x'.
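
Together with rempages_keyfigs, these figures summarize the page churn over a period (timestamps and resulting values hypothetical):

>>> new = newpages_keyfigs('240401-0400', '240301-0400')
>>> rem = rempages_keyfigs('240401-0400', '240301-0400')
>>> new['pages_new'], rem['pages_rem']
(23, 17)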

#   def modpages_keyfigs( rep_ts: str, ref_ts: str, min_modification: float = 0.005 ) -> dict[str, int]:
View Source
def modpages_keyfigs(rep_ts: str, ref_ts: str,
                     min_modification: float = 0.005) -> dict[str, int]:
    """
    **Get key figures for page modifications since reference scrape.**

    Arguments:

        rep_ts: timestamp of the reporting scrape [yymmdd-hhmm]
        ref_ts: timestamp of the reference scrape [yymmdd-hhmm]
        min_modification: modification factor threshold below which
            modifications will not be counted

    Returns:

        name, value dictionary of the key figures

    For all pages alive at some moment within the period defined by
    `ref_ts` and `rep_ts`, the following key figures are returned,
    distinguishing pages whose editorial content was modified above the
    `min_modification` threshold from those whose content was not:

    - pages_mod: total number of modified pages
    - pages_lang_`<language>`_mod: number of modified pages per *language*
    - pages_buss_`<business>`_mod: number of modified pages per *business*
    - pages_cat_`<category>`_mod: number of modified pages per *category*
    - pages_unm: total number of unmodified pages
    - pages_lang_`<language>`_unm: number of unmodified pages per *language*
    - pages_buss_`<business>`_unm: number of unmodified pages per *business*
    - pages_cat_`<category>`_unm: number of unmodified pages per *category*

    When both timestamps (`rep_ts` and `ref_ts`) are the same, the key
    figures will be returned with a (textual) value of 'x'.
    """

    # Handle the special cases
    if not ref_ts or not rep_ts or ref_ts > rep_ts:
        raise ValueError('invalid period')
    if ref_ts == rep_ts:
        qry = f'''
            WITH
                -- Last known different combinations of language, business and
                -- category until reporting timestamp.
                lan_bus_cat_combis AS (
                    SELECT DISTINCT
                        last_value(language) OVER win AS language,
                        last_value(business) OVER win AS business,
                        last_value(category) OVER win AS category
                    FROM his_pages_info
                    WHERE timestamp <= '{rep_ts}'
                    WINDOW win AS (
                        PARTITION BY page_id
                        ORDER BY timestamp
                        ROWS BETWEEN UNBOUNDED PRECEDING
                            AND UNBOUNDED FOLLOWING
                    )
                ),
                mod_keyfigs (name, value) AS (
                    SELECT 'pages_mod', 'x'
                        UNION
                    SELECT
                        'pages_lang_' || ifnull(language, 'None') || '_mod',
                        'x'
                    FROM lan_bus_cat_combis
                    GROUP BY language
                        UNION
                    SELECT
                        'pages_buss_' || ifnull(business, 'None') || '_mod',
                        'x'
                    FROM lan_bus_cat_combis
                    GROUP BY business
                        UNION
                    SELECT
                        'pages_cat_' || ifnull(category, 'None') || '_mod',
                        'x'
                    FROM lan_bus_cat_combis
                    GROUP BY category
                )
            SELECT name, value
            FROM mod_keyfigs'''
        logger.info('key figures for editorial text modifications requested '
                    'for period with zero length')
        mod_keyfigs = {name: value for name, value
                       in mst_conn.execute(qry).fetchall()}
        unm_keyfigs = {}
        for mod_name, mod_value in mod_keyfigs.items():
            unm_name = mod_name[:-3] + 'unm'
            unm_keyfigs[unm_name] = 'x'
        return mod_keyfigs | unm_keyfigs

    # Create table with modifications of editorial texts within period
    mst_conn.execute(f'''
        CREATE TEMPORARY TABLE edtext_mods AS
        WITH
            -- All timestamps within the reporting period
            period AS (
                SELECT timestamp
                FROM mst_scrapes
                WHERE timestamp > '{ref_ts}'
                    AND timestamp <= '{rep_ts}'
            ),
            -- All stored ed_text values per page, together with their
            -- previous value.
            curr_and_prev_edtext AS (
                SELECT page_id, timestamp, ed_text,
                    first_value(timestamp) OVER win1 AS prev_timestamp,
                    first_value(ed_text) OVER win1 AS prev_ed_text
                FROM his_pages_info
                WINDOW win1 AS (
                    PARTITION BY page_id
                    ORDER BY timestamp
                    ROWS 1 PRECEDING
                )
            ),
            -- Language, business and category aspects of all pages
            lan_bus_cat_aspects AS (
                SELECT DISTINCT
                    page_id,
                    last_value(language) OVER win2 AS language,
                    last_value(business) OVER win2 AS business,
                    last_value(category) OVER win2 AS category
                FROM his_pages_info
                WHERE timestamp <= '{rep_ts}'
                WINDOW win2 AS (
                    PARTITION BY page_id
                    ORDER BY timestamp
                    ROWS BETWEEN UNBOUNDED PRECEDING
                        AND UNBOUNDED FOLLOWING
                )
            )
        SELECT DISTINCT
            page_id, language, business, category,
            first_value(prev_ed_text) OVER win3 AS begin_edtext,
            last_value(ed_text) OVER win3 AS end_edtext,
            0 AS mod_fact
        FROM curr_and_prev_edtext
        LEFT JOIN lan_bus_cat_aspects USING (page_id)
        WHERE ed_text != prev_ed_text AND timestamp IN period
        WINDOW win3 AS (
            PARTITION BY page_id
            ORDER BY timestamp
            ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING
        )
        ''')
    # Calculate and update the modification factors
    qry = '''
        SELECT page_id, begin_edtext, end_edtext
        FROM edtext_mods
        '''
    for rec in mst_conn.execute(qry).fetchall():
        mst_conn.execute(f'''
            UPDATE edtext_mods
            SET mod_fact = {mod_factor(rec[1], rec[2])}
            WHERE page_id = {rec[0]}
            ''')
    # Count the page modification key figures
    qry = f'''
        WITH
            edtext_keyfigs (name, value) AS (
                SELECT 'pages_mod', count(*)
                FROM edtext_mods
                WHERE mod_fact >= {min_modification}
                    UNION
                SELECT
                    'pages_lang_' || ifnull(language, 'None') || '_mod',
                    count(*)
                FROM edtext_mods
                WHERE mod_fact >= {min_modification}
                GROUP BY language
                    UNION
                SELECT
                    'pages_buss_' || ifnull(business, 'None') || '_mod',
                    count(*)
                FROM edtext_mods
                WHERE mod_fact >= {min_modification}
                GROUP BY business
                    UNION
                SELECT
                    'pages_cat_' || ifnull(category, 'None') || '_mod',
                    count(*)
                FROM edtext_mods
                WHERE mod_fact >= {min_modification}
                GROUP BY category
            )
        SELECT name, value
        FROM edtext_keyfigs
        '''
    mod_keyfigs = {name: value for name, value
                   in mst_conn.execute(qry).fetchall()}
    mst_conn.execute('DROP TABLE edtext_mods')
    # Get key figures for living pages in period as intermediate result
    qry = f'''
        WITH
            -- All timestamps within the reporting period
            period AS (
                SELECT timestamp
                FROM mst_scrapes
                WHERE timestamp > '{ref_ts}'
                    AND timestamp <= '{rep_ts}'
            ),
            -- Page alive values at start of period
            life_at_start AS (
                SELECT DISTINCT
                    page_id,
                    last_value(alive) OVER (
                        PARTITION BY page_id
                        ORDER BY timestamp
                        ROWS BETWEEN UNBOUNDED PRECEDING
                            AND UNBOUNDED FOLLOWING
                    ) AS alive
                FROM his_pages_life
                WHERE timestamp <= '{ref_ts}'
            ),
            -- Pages alive at some moment in period
            alive_in_period AS (
                -- Pages alive at start of period
                SELECT page_id
                FROM life_at_start
                WHERE alive
                    UNION
                -- All new pages in period
                SELECT page_id
                FROM his_pages_life
                WHERE timestamp in period AND alive
            ),
            -- Language, business and category aspects of pages alive in period
            lan_bus_cat_aspects AS (
                SELECT DISTINCT
                    page_id,
                    last_value(language) OVER win AS language,
                    last_value(business) OVER win AS business,
                    last_value(category) OVER win AS category
                FROM his_pages_info
                WHERE timestamp <= '{rep_ts}' AND page_id IN alive_in_period
                WINDOW win AS (
                    PARTITION BY page_id
                    ORDER BY timestamp
                    ROWS BETWEEN UNBOUNDED PRECEDING
                        AND UNBOUNDED FOLLOWING
                )
            ),
            -- Key figures for pages alive in period (not used in report)
            living_keyfigs (name, value) AS (
                SELECT 'pages_liv', count(*)
                FROM alive_in_period
                    UNION
                SELECT
                    'pages_lang_' || ifnull(language, 'None') || '_liv',
                    count(*)
                FROM lan_bus_cat_aspects
                GROUP BY language
                    UNION
                SELECT
                    'pages_buss_' || ifnull(business, 'None') || '_liv',
                    count(*)
                FROM lan_bus_cat_aspects
                GROUP BY business
                    UNION
                SELECT
                    'pages_cat_' || ifnull(category, 'None') || '_liv',
                    count(*)
                FROM lan_bus_cat_aspects
                GROUP BY category
            )
        SELECT name, value
        FROM living_keyfigs
        '''
    liv_keyfigs = {name: value for name, value
                   in mst_conn.execute(qry).fetchall()}
    unm_keyfigs = {}
    for liv_name, liv_val in liv_keyfigs.items():
        mod_name = liv_name[:-3] + 'mod'
        mod_val = mod_keyfigs.get(mod_name, 0)
        unm_name = liv_name[:-3] + 'unm'
        unm_val = liv_val - mod_val
        unm_keyfigs[unm_name] = unm_val
    # Return the (un)modified key figures, but not the living
    return mod_keyfigs | unm_keyfigs

Get key figures for page modifications since reference scrape.

Arguments:

rep_ts: timestamp of the reporting scrape [yymmdd-hhmm]
ref_ts: timestamp of the reference scrape [yymmdd-hhmm]
min_modification: modification factor threshold below which
    modifications will not be counted

Returns:

name, value dictionary of the key figures

For all pages alive at some moment within the period defined by ref_ts and rep_ts, the following key figures are returned, distinguishing pages whose editorial content was modified above the min_modification threshold from those whose content was not:

  • pages_mod: total number of modified pages
  • pages_lang_<language>_mod: number of modified pages per language
  • pages_buss_<business>_mod: number of modified pages per business
  • pages_cat_<category>_mod: number of modified pages per category
  • pages_unm: total number of unmodified pages
  • pages_lang_<language>_unm: number of unmodified pages per language
  • pages_buss_<business>_unm: number of unmodified pages per business
  • pages_cat_<category>_unm: number of unmodified pages per category

When both timestamps (rep_ts and ref_ts) are the same, the key figures will be returned with a (textual) value of 'x'.
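
The modification factor itself is computed by the mod_factor helper, which is defined elsewhere in this module. As an indication of what such a factor expresses, a minimal sketch based on difflib (which this module imports); the actual implementation may differ:

import difflib

def mod_factor_sketch(old_text: str, new_text: str) -> float:
    # Hypothetical stand-in for the module's mod_factor helper: the changed
    # fraction of the editorial text, computed as one minus the similarity
    # ratio of the two versions.
    return 1 - difflib.SequenceMatcher(None, old_text, new_text).ratio()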

#   def keyfig_details(language: str = 'english') -> dict[str, list[int, str, str]]:
View Source
def keyfig_details(language: str = 'english') -> dict[str, list[int, str, str]]:
    """
    **Get details for all potential key figures.**

    Arguments:

        language: language of the description to return; 'dutch' will return
            Dutch descriptions, any other value will return them in English

    Returns:

        seq_nr, cluster, description per key figure

    The details are read from the configured xlsx-file `kf_details_name`,
    which has five fields per row:

    - sequence number for ordering
    - key figure name
    - cluster name for visual grouping
    - dutch description
    - english description
    """

    file = Path(report_conf.kf_details_name)
    ws = openpyxl.load_workbook(file).active
    kf_details = {}
    for row_nr, (seq_nr, name, cluster, dutch, english) \
            in enumerate(ws.values, start=1):
        if row_nr == 1:
            # Skip row with field names
            continue
        kf_details[name] = [
            int(seq_nr), cluster, dutch if language == 'dutch' else english
        ]
    return kf_details

Get details for all potential key figures.

Arguments:

language: language of the description to return; 'dutch' will return
    Dutch descriptions, any other value will return them in English

Returns:

seq_nr, cluster, description per key figure

The details are read from the configured xlsx-file kf_details_name, which has five fields per row:

  • sequence number for ordering
  • key figure name
  • cluster name for visual grouping
  • dutch description
  • english description
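
Assuming the configured workbook is in place, usage is straightforward (the returned sequence number, cluster and description are hypothetical):

>>> details = keyfig_details()
>>> details['pages_new']
[120, 'Pages', 'Total number of new pages']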

#   def data_legend_sheet(wb: bd_www.report.ReportWorkbook) -> None:
View Source
def data_legend_sheet(wb: ReportWorkbook) -> None:
    """
    **Add report sheet with column legend.**

    Arguments:

        wb: workbook in which the new worksheet is created

    The sheet gives descriptions of columns in the sheets of the report that
    may not be self-explanatory. These descriptions are read from the
    xlsx-file that is configured via the `data_legend_name` field in the
    [REPORT] section of the configuration file (see documentation of
    `bd_www.Config`).
    """

    dl_file = Path(report_conf.data_legend_name)
    ws = openpyxl.load_workbook(dl_file).active
    legend_details = [row for row in ws.values]
    del legend_details[0]  # Remove row with field names
    wb.add_datasheet('Data legend', legend_details,
                     shading='cluster', cluster_col=1, cluster_del=True)

Add report sheet with column legend.

Arguments:

wb: workbook in which the new worksheet is created

The sheet gives descriptions of columns in the sheets of the report that may not be self-explanatory. These descriptions are read from the xlsx-file that is configured via the data_legend_name field in the [REPORT] section of the configuration file (see documentation of bd_www.Config).

#   def rem_pages_sheet( wb: bd_www.report.ReportWorkbook, rep_ts: str, ref_ts: str, incl_txt_fb: bool = True ) -> None:
View Source
def rem_pages_sheet(wb: ReportWorkbook,
                    rep_ts: str, ref_ts: str, incl_txt_fb: bool = True) -> None:
    """
    **Add report sheet with data of removed pages.**

    Arguments:

        wb: workbook in which the new worksheet is created
        rep_ts: timestamp of the scrape that marks the (exclusive) end of the
            reporting period
        ref_ts: timestamp of the scrape that marks the (inclusive) start of the
            reporting period and the (exclusive) end of the reference period
        incl_txt_fb: if False, no textual feedback will be included

    This sheet lists the details of all pages that were removed since the
    reference scrape.
    """

    # Set (inclusive) dates for the reporting period
    rep_end = ts_to_d(rep_ts, -1)
    rep_start = ts_to_d(ref_ts) if ref_ts else rep_end

    # Create temporary views
    page_themes()
    per_metrics(rep_start, rep_end)
    per_feedback(rep_start, rep_end)

    qry = f'''
        WITH
            -- Pages removed in the period with the timestamp of removal
            page_removals (page_id, rem_ts) AS (
                SELECT page_id, timestamp
                FROM his_pages_life
                WHERE alive = 0
                    AND timestamp > '{ref_ts}'
                    AND timestamp <= '{rep_ts}'
            ),
            -- Info timestamp of the removed pages
            rem_pages_info_ts (page_id, rem_ts, info_ts) AS (
                SELECT DISTINCT
                    page_id, rem_ts,
                    last_value(timestamp) OVER (
                        PARTITION BY page_id, rem_ts
                        ORDER BY timestamp
                        ROWS BETWEEN UNBOUNDED PRECEDING
                            AND UNBOUNDED FOLLOWING
                    )
                FROM page_removals
                LEFT JOIN his_pages_info USING (page_id)
                WHERE timestamp <= rem_ts
            ),
            -- All aspects of the removed pages
            removed_pages_aspects AS (
                SELECT rem_ts, i.*
                FROM rem_pages_info_ts AS r
                LEFT JOIN his_pages_info AS i
                    ON r.page_id = i.page_id AND r.info_ts = i.timestamp
            )
        SELECT
            rem_ts, page_id, path, 
            title, description, first_h1, language, modified, pagetype,
            classes, theme, business, category,
            ed_text, aut_text,
            visits, views, entries, bounces, bounce_rate, exits, exit_rate,
            organic_entries, organic_entry_rate, call_visits, call_rate,
            neg_fb_cnt, pos_fb_cnt,
            ifnull(CAST (neg_fb_cnt + pos_fb_cnt AS REAL) / visits,0)
                AS fb_rate,
            neg_fb_pct, pos_fb_pct, neg_txts, pos_txts
        FROM removed_pages_aspects
        LEFT JOIN mst_paths USING (page_id)
        LEFT JOIN temp.themes USING (page_id)
        LEFT JOIN temp.per_metrics USING (page_id)
        LEFT JOIN temp.per_feedback USING (page_id)
        ORDER BY page_id, timestamp
        '''
    data = []
    # Calculate and insert feedback data into the query result
    for cells in mst_conn.execute(qry).fetchall():
        cells = list(cells)
        # Reformat or delete textual feedback
        if incl_txt_fb:
            if cells[31]:
                cells[31] = cell_feedbacks(prep_feedbacks(cells[31]))
            if cells[32]:
                cells[32] = cell_feedbacks(prep_feedbacks(cells[32]))
        else:
            del cells[31:33]
        data.append(cells)
    wb.add_datasheet('Removed pages', data, incl_txt_fb)

Add report sheet with data of removed pages.

Arguments:

wb: workbook in which the new worksheet is created
rep_ts: timestamp of the scrape that marks the (exclusive) end of the
    reporting period
ref_ts: timestamp of the scrape that marks the (inclusive) start of the
    reporting period and the (exclusive) end of the reference period
incl_txt_fb: if False, no textual feedback will be included

This sheet lists the details of all pages that were removed since the reference scrape.

#   def page_themes() -> None:
View Source
def page_themes() -> None:
    """
    **Create view with theme of each page.**

    A new temporary view `themes` is created with the fields `page_id` and
    `theme`. The value of `theme` is deduced from the page path according to
    the following patterns (in which `<lc>` is a two-letter language code):

    - /`<lc>`/<theme>
    - /bldcontent`<lc>`/berichten/<theme>
    - /bldcontent`<lc>`/themaoverstijgend/<theme>
    - /bldcontent`<lc>`/belastingdienst/<segment>/<theme>
    - /bldcontent`<lc>`/standaard_functies/<segment>/<theme>
    - /bldcontent`<lc>`/events/<segment>/<theme>
    - /bldcontent`<lc>`/niet_in_enig_menu/<segment>/<theme>
    - /bldcontent`<lc>`/campagnes/landingspaginas/<segment>/<theme>
    - /bldsysteem/... : theme 'systeem'

    Since the view does not depend on scrape or period, it will not be
    replaced when it is available already.

    The view works upon the scrapes database, which is connected via the
    global master connection `mst_conn`.
    """

    mst_conn.execute('''
        CREATE TEMPORARY VIEW IF NOT EXISTS themes AS
        WITH
            -- CTE to extract the first five segments from the page path
            one_segment AS (
                SELECT
                    page_id,
                    substr(path, 2, instr(substring(path, 2), '/')-1)
                        AS segment1,
                    substr(substring(path, 2),
                           instr(substring(path, 2), '/')+1) || '/' AS rest
                FROM mst_paths
            ),
            two_segments AS (
                SELECT
                    page_id, segment1,
                    substr(rest, 1, instr(rest, '/')-1) AS segment2,
                    substr(rest, instr(rest, '/')+1) AS rest
                FROM one_segment
            ),
            three_segments AS (
                SELECT
                    page_id, segment1, segment2,
                    substr(rest, 1, instr(rest, '/')-1) AS segment3,
                    substr(rest, instr(rest, '/')+1) AS rest
                FROM two_segments
            ),
            four_segments AS (
                SELECT
                    page_id, segment1, segment2, segment3,
                    substr(rest, 1, instr(rest, '/')-1) AS segment4,
                    substr(rest, instr(rest, '/')+1) AS rest
                FROM three_segments
            ),
            five_segments AS (
                SELECT
                    page_id, segment1, segment2, segment3, segment4,
                    substr(rest, 1, instr(rest, '/')-1) AS segment5
                FROM four_segments
            )
        -- Theme for urls like /<lc>/<theme>
        SELECT
            page_id, segment2 AS theme
        FROM five_segments
        WHERE segment1 LIKE '__'
        UNION
        -- Theme for urls like:
        --    /bldcontent<lc>/berichten/<theme>
        --    /bldcontent<lc>/themaoverstijgend/<theme>
        SELECT
            page_id, segment3 AS theme
        FROM five_segments
        WHERE segment1 LIKE 'bldcontent__'
            AND segment2 IN ('berichten', 'themaoverstijgend')
        UNION
        -- theme for urls like:
        --    /bldcontent<lc>/belastingdienst/<segment>/<theme>
        --    /bldcontent<lc>/standaard_functies/<segment>/<theme>
        --    /bldcontent<lc>/events/<segment>/<theme>
        --    /bldcontent<lc>/niet_in_enig_menu/<segment>/<theme>
        SELECT
            page_id, segment4 AS theme
        FROM five_segments
        WHERE segment1 LIKE 'bldcontent__'
            AND segment2 IN ('belastingdienst', 'standaard_functies',
                             'events', 'niet_in_enig_menu')
        UNION
        -- theme for urls like:
        --    /bldcontent<lc>/campagnes/landingspaginas/<segment>/<theme>
        SELECT
            page_id, segment5 AS theme
        FROM five_segments
        WHERE segment1 LIKE 'bldcontent__'
            AND segment2 = 'campagnes'
            AND segment3 = 'landingspaginas'
        UNION
        -- theme 'systeem' for urls like:
        --    /bldsysteem 
        SELECT
            page_id, 'systeem' AS theme
        FROM four_segments
        WHERE segment1 = 'bldsysteem'
        ''')

Create view with theme of each page.

A new temporary view themes is created with the fields page_id and theme. The value of theme is deduced from the page path according to the following patterns (in which <lc> is a two-letter language code):

  • /<lc>/<theme>
  • /bldcontent<lc>/berichten/<theme>
  • /bldcontent<lc>/themaoverstijgend/<theme>
  • /bldcontent<lc>/belastingdienst/<segment>/<theme>
  • /bldcontent<lc>/standaard_functies/<segment>/<theme>
  • /bldcontent<lc>/events/<segment>/<theme>
  • /bldcontent<lc>/niet_in_enig_menu/<segment>/<theme>
  • /bldsysteem/... : theme 'systeem'

Since the view does not depend on scrape or period, it will not be replaced when it is available already.

The view works upon the scrapes database, which is connected via the global master connection mst_conn.
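
To make the layered SQL easier to follow, the sketch below (illustrative only, not part of the module) expresses the same theme deduction in plain Python for sufficiently deep, hypothetical paths. Note that the SQL matches the prefix with LIKE 'bldcontent__', which is stricter than the startswith check used here.

    # Illustrative Python equivalent of the theme deduction
    def theme_from_path(path: str) -> str | None:
        seg = path.strip('/').split('/')
        if len(seg) >= 2 and len(seg[0]) == 2:              # /<lc>/<theme>
            return seg[1]
        if seg[0] == 'bldsysteem':                          # /bldsysteem/...
            return 'systeem'
        if seg[0].startswith('bldcontent'):
            if seg[1] in ('berichten', 'themaoverstijgend'):
                return seg[2]
            if seg[1] in ('belastingdienst', 'standaard_functies',
                          'events', 'niet_in_enig_menu'):
                return seg[3]
            if seg[1] == 'campagnes' and seg[2] == 'landingspaginas':
                return seg[4]
        return None

    assert theme_from_path(
        '/bldcontentnl/belastingdienst/prive/auto/auto.html') == 'auto'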

#   def per_metrics(first_date: datetime.date, last_date: datetime.date) -> None:
View Source
def per_metrics(first_date: dt.date, last_date: dt.date) -> None:
    """
    **Create temporary view with metrics for all pages alive in period.**

    Arguments:

        first_date: first day of the period
        last_date: last day of the period

    A new temporary view `per_metrics` is created with the following values for
    all pages alive at some moment during the period:

    - `page_id`: the id of the page
    - `visits`: number of visits that included this page [INTEGER]
    - `views`: number of times this page has been requested [INTEGER]
    - `entries`: number of visits that started on this page [INTEGER]
    - `bounces`: number of visits that only viewed this page [INTEGER]
    - `exits`: number of visits that ended on this page [INTEGER]
    - `bounce_rate`: fraction of entries on this page that bounced
      [REAL: `bounces` / `entries`]
    - `exit_rate`: fraction of visits that exited on this page
      [REAL: `exits` / `visits`]
    - `organic_entries`: number of entries that originated from an external
      search engine [INTEGER]
    - `organic_entry_rate`: fraction of visits that started on the page and
      originated from an external search engine
      [REAL: `organic_entries` / `visits`]
    - `call_visits`: number of visits that included this page as well as a page
      with calling information [INTEGER]
    - `call_rate`: fraction of visits that included a page with calling
      information [REAL: `call_visits` / `visits`]

    An existing view with the same name will be overwritten.

    The view works upon the metrics database, which is connected via the
    global master connection `mst_conn`.
    """

    per_pages(first_date, last_date)
    mst_conn.execute('DROP VIEW IF EXISTS per_metrics')
    mst_conn.execute(f'''
        CREATE TEMPORARY VIEW per_metrics AS
        WITH
            -- Dates in period
            per_days AS (
                SELECT date_id
                FROM dates
                WHERE date >= '{first_date}' AND date <= '{last_date}'
            ),
            -- Available metrics
            per_basic_metrics AS (
                SELECT DISTINCT
                    page_id,
                    sum(nb_visits) AS visits,
                    sum(nb_hits) AS views,
                    sum(entry_nb_visits) AS entries,
                    sum(entry_bounce_count) AS bounces,
                    sum(exit_nb_visits) AS exits,
                    sum(organic_entries) AS organic_entries,
                    sum(call_visits) AS call_visits
                FROM daily
                WHERE date_id IN per_days
                GROUP BY page_id
            ),
            -- Pages alive in period with available basic metrics
            per_pages_metrics AS (
                SELECT
                    page_id,
                    ifnull(visits,0) AS visits,
                    ifnull(views,0) AS views,
                    ifnull(entries,0) AS entries,
                    ifnull(bounces,0) AS bounces,
                    ifnull(exits,0) AS exits,
                    ifnull(organic_entries,0) AS organic_entries,
                    ifnull(call_visits,0) AS call_visits
                FROM per_pages
                LEFT JOIN per_basic_metrics USING (page_id)
            )
        -- Pages alive in period with basic and calculated metrics
        SELECT
            *,
            ifnull(round(CAST(bounces AS REAL) / entries, 3), 0) AS bounce_rate,
            ifnull(round(CAST(exits AS REAL) / visits, 3), 0) AS exit_rate,
            ifnull(round(CAST(organic_entries AS REAL) / visits, 3), 0)
                AS organic_entry_rate,
            ifnull(round(CAST(call_visits AS REAL) / visits, 3), 0) AS call_rate
        FROM per_pages_metrics
        ''')

Create temporary view with metrics for all pages alive in period.

Arguments:

first_date: first day of the period
last_date: last day of the period

A new temporary view per_metrics is created with the following values for all pages alive at some moment during the period:

  • page_id: the id of the page
  • visits: number of visits that included this page [INTEGER]
  • views: number of times this page has been requested [INTEGER]
  • entries: number of visits that started on this page [INTEGER]
  • bounces: number of visits that only viewed this page [INTEGER]
  • exits: number of visits that ended on this page [INTEGER]
  • bounce_rate: fraction of entries on this page that bounced [REAL: bounces / entries]
  • exit_rate: fraction of visits that exited on this page [REAL: exits / visits]
  • organic_entries: number of entries that originated from an external search engine [INTEGER]
  • organic_entry_rate: fraction of visits that started on the page and originated from an external search engine [REAL: organic_entries / visits]
  • call_visits: number of visits that included this page as well as a page with calling information [INTEGER]
  • call_rate: fraction of visits that included a page with calling information [REAL: call_visits / visits]

An existing view with the same name will be overwritten.

The view works upon the metrics database, which is connected via the global master connection mst_conn.
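
A minimal ad-hoc usage sketch (illustrative; the view is normally created and consumed by the sheet functions), assuming the metrics database holds data for January 2023:

    import datetime as dt

    # (Re)create the view for January 2023 and list the five most visited
    # pages with their bounce rate
    per_metrics(dt.date(2023, 1, 1), dt.date(2023, 1, 31))
    qry = '''
        SELECT page_id, visits, bounce_rate
        FROM per_metrics
        ORDER BY visits DESC
        LIMIT 5
        '''
    for page_id, visits, bounce_rate in mst_conn.execute(qry).fetchall():
        print(page_id, visits, bounce_rate)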

#   def per_pages(first_date: datetime.date, last_date: datetime.date) -> None:
View Source
def per_pages(first_date: dt.date, last_date: dt.date) -> None:
    """
    **Create temporary view with page_id's of all pages alive in period.**

    Arguments:

        first_date: first day of the period
        last_date: last day of the period

    An existing view with the same name will be overwritten.

    The view works upon the metrics database, which is connected via the
    global master connection `mst_conn`.
    """

    mst_conn.execute('DROP VIEW IF EXISTS per_pages')
    mst_conn.execute(f'''
        CREATE TEMPORARY VIEW per_pages AS
        WITH
            -- All timestamps between last scrape at or before first-date and 
            -- first scrape after last-date. 
            period (per_ts) AS (
                SELECT timestamp
                FROM mst_scrapes
                WHERE timestamp >= (SELECT max(timestamp)
                                    FROM mst_scrapes 
                                    WHERE date <= '{first_date}')
                    AND timestamp <= (SELECT min(timestamp) 
                                      FROM mst_scrapes 
                                      WHERE date > '{last_date}')
            ),
            -- Pages alive values at each period timestamp
            pages_alive_value (per_ts, page_id, alive) AS (
                SELECT DISTINCT
                    per_ts, page_id,
                    last_value(alive) OVER (
                        PARTITION BY per_ts, page_id
                        ORDER BY timestamp
                        ROWS BETWEEN UNBOUNDED PRECEDING
                            AND UNBOUNDED FOLLOWING
                    )
                FROM his_pages_life, period
                WHERE timestamp <= per_ts
            )
        -- Pages alive at some moment in period
        SELECT DISTINCT page_id
        FROM pages_alive_value
        WHERE alive
        ''')

Create temporary view with page_id's of all pages alive in period.

Arguments:

first_date: first day of the period
last_date: last day of the period

An existing view with the same name will be overwritten.

The view works upon the metrics database, which is connected via the global master connection mst_conn.
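
For ad-hoc inspection the view can also be queried directly; a minimal sketch (illustrative only; per_metrics and per_feedback recreate this view implicitly):

    import datetime as dt

    per_pages(dt.date(2023, 1, 1), dt.date(2023, 1, 31))
    alive = mst_conn.execute('SELECT count(*) FROM per_pages').fetchone()[0]
    print(f'{alive} pages were alive at some moment in January 2023')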

#   def per_feedback(first_date: datetime.date, last_date: datetime.date) -> None:
View Source
def per_feedback(first_date: dt.date, last_date: dt.date) -> None:
    """
    **Create temporary feedback data for all pages alive in period.**

    Arguments:

        first_date: first day of the period
        last_date: last day of the period

    A new temporary view `per_feedback` is created with the following values for
    all pages alive at some moment during the period:

    - `page_id`: the id of the page
    - `neg_fb_cnt`: number of negative feedbacks [INTEGER]
    - `pos_fb_cnt`: number of positive feedbacks [INTEGER]
    - `fb_cnt`: total number of feedbacks [INTEGER: `neg_fb_cnt` + `pos_fb_cnt`]
    - `neg_txts`: newline separated negative feedback texts [TEXT]
    - `pos_txts`: newline separated positive feedback texts [TEXT]
    - `neg_fb_pct`: fraction of all feedbacks that are negative
      [REAL: `neg_fb_cnt` / `fb_cnt`]
    - `pos_fb_pct`: fraction of all feedbacks that are positive
      [REAL: `pos_fb_cnt` / `fb_cnt`]

    An existing view with the same name will be overwritten.

    The view works upon the metrics database, which is connected via the
    global master connection `mst_conn`.
    """

    per_pages(first_date, last_date)
    mst_conn.execute('DROP VIEW IF EXISTS per_feedback')
    mst_conn.execute(f'''
        CREATE TEMPORARY VIEW per_feedback AS
        WITH
            -- Dates in period
            per_days AS (
                SELECT date_id
                FROM dates
                WHERE date >= '{first_date}' AND date <= '{last_date}'
            ),
            -- Available feedback data
            per_basic_feedback AS (
                SELECT
                    page_id,
                    sum(neg_cnt) AS neg_fb_cnt,
                    sum(pos_cnt) AS pos_fb_cnt,
                    sum(neg_cnt + pos_cnt) AS fb_cnt,
                    group_concat(neg_txt, char(10)) AS neg_txts,
                    group_concat(pos_txt, char(10)) AS pos_txts
                FROM feedback
                WHERE date_id IN per_days
                GROUP BY page_id
            ),
            -- Pages alive in period with available basic feedback data
            per_pages_feedback AS (
                SELECT
                    page_id,
                    ifnull(neg_fb_cnt,0) AS neg_fb_cnt,
                    ifnull(pos_fb_cnt,0) AS pos_fb_cnt,
                    ifnull(fb_cnt,0) AS fb_cnt,
                    neg_txts, pos_txts
                FROM per_pages
                LEFT JOIN per_basic_feedback USING (page_id)
            )
        -- Pages alive in period with basic and calculated feedback data
        SELECT
            *,
            ifnull(CAST (neg_fb_cnt AS REAL) / fb_cnt,0) AS neg_fb_pct,
            ifnull(CAST (pos_fb_cnt AS REAL) / fb_cnt,0) AS pos_fb_pct
        FROM per_pages_feedback
        ''')

Create temporary feedback data for all pages alive in period.

Arguments:

first_date: first day of the period
last_date: last day of the period

A new temporary view per_feedback is created with the following values for all pages alive at some moment during the period:

  • page_id: the id of the page
  • neg_fb_cnt: number of negative feedbacks [INTEGER]
  • pos_fb_cnt: number of positive feedbacks [INTEGER]
  • fb_cnt: total number of feedbacks [INTEGER: neg_fb_cnt + pos_fb_cnt]
  • neg_txts: newline separated negative feedback texts [TEXT]
  • pos_txts: newline separated positive feedback texts [TEXT]
  • neg_fb_pct: fraction of all feedbacks that are negative [REAL: neg_fb_cnt / fb_cnt]
  • pos_fb_pct: fraction of all feedbacks that are positive [REAL: pos_fb_cnt / fb_cnt]

An existing view with the same name will be overwritten.

The view works upon the metrics database, which is connected via the global master connection mst_conn.
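
A worked example of the calculated fields; despite the _pct suffix both values are fractions:

    # With 3 negative and 9 positive feedbacks:
    neg_fb_cnt, pos_fb_cnt = 3, 9
    fb_cnt = neg_fb_cnt + pos_fb_cnt        # 12
    assert neg_fb_cnt / fb_cnt == 0.25      # neg_fb_pct
    assert pos_fb_cnt / fb_cnt == 0.75      # pos_fb_pct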

#   def prep_feedbacks(combined_fbs: str) -> list[str]:
View Source
def prep_feedbacks(combined_fbs: str) -> list[str]:
    r"""
    **Prepare textual feedbacks.**

    Arguments:

        combined_fbs: newline concatenated feedbacks

    Returns:

        prepared feedbacks

    The feedbacks are deduplicated, cleansed and filtered according to the
    following rules:

    - leading and trailing whitespace and / (converted hard return) is
      removed
    - leading ! , . / < = > \ ) ] ^ _ ` ~ is removed
    - feedback consisting of only digits, punctuation and/or whitespace is
      discarded
    - feedback with only one kind of character is discarded
    - non-printable characters are removed
    - feedback containing any of the following texts is discarded
        - `<script>`
        - `</script>`
        - `javascript`

    The resulting set is lexicographically sorted with feedbacks starting
    with an alphabetic character before the others.
    """

    def feedback_sorting_key(txt: str) -> str:
        """
        **Helper function to sort user feedback texts.**

        Arguments:

            txt: text item to be sorted

        Returns:

            modified input for sorting purposes

        Converts the input to lowercase and prepends it with ~ (the printable
        ascii character that sorts last) if the first character is
        non-alphabetic.
        """

        txt = txt.lower()
        if txt[0] not in string.ascii_letters:
            txt = '~' + txt
        return txt

    non_starters = r'!,./<=>\)]^_`~'
    forbidden_text = {'<script>', '</script>', 'javascript'}
    fbs = []
    for fb in list(set(combined_fbs.split('\n'))):
        fb = fb.lstrip(non_starters + string.whitespace)
        fb = fb.rstrip(string.whitespace + '/')
        if not fb.strip(string.punctuation + string.digits + string.whitespace):
            continue
        if len(set(fb.lower())) == 1:
            continue
        if not fb.isprintable():
            fb = ''.join(c for c in fb if c.isprintable())
        for text in forbidden_text:
            if text in fb:
                fb = ''
        if fb:
            fbs.append(fb)

    return sorted(fbs, key=feedback_sorting_key)

Prepare textual feedbacks.

Arguments:

combined_fbs: newline concatenated feedbacks

Returns:

prepared feedbacks

The feedbacks are deduplicated, cleansed and filtered according to the following rules:

  • leading and trailing whitespace and / (converted hard return) is removed
  • leading ! , . / < = > \ ) ] ^ _ ` ~ is removed
  • feedback consisting of only digits, punctuation and/or whitespace is discarded
  • feedback with only one kind of character is discarded
  • non-printable characters are removed
  • feedback containing any of the following texts is discarded
    • <script>
    • </script>
    • javascript

The resulting set is lexicographically sorted with feedbacks starting with an alphabetic character before the others.
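
An illustrative round trip with hypothetical feedback texts, showing deduplication, stripping, filtering and sorting:

    fbs = prep_feedbacks(
        'goede uitleg!\ngoede uitleg!\n!!!\n1234\naaaa\n  / duidelijk /')
    # '!!!' and '1234' are discarded (punctuation/digits only), 'aaaa' is
    # discarded (one kind of character), the duplicate is removed and
    # '  / duidelijk /' is stripped down to 'duidelijk'
    assert fbs == ['duidelijk', 'goede uitleg!']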

#   def cell_feedbacks(feedbacks: list[str]) -> str:
View Source
def cell_feedbacks(feedbacks: list[str]) -> str:
    """
    **Prepare textual feedbacks for worksheet cell.**

    Arguments:

        feedbacks: output from `prep_feedbacks` function

    Returns:

        feedbacks to fit one cell

    Newline concatenated feedbacks, potentially truncated, to fit one cell of
    a report sheet.
    """

    fbs = '\n'.join(feedbacks)
    if len(fbs) > _CELL_MAX:
        cut = fbs.rfind('\n', 0, _CELL_MAX)
        # Guard against a single feedback that exceeds the cell maximum
        fbs = fbs[:cut] if cut > 0 else fbs[:_CELL_MAX]
        shown_fb = fbs.count('\n') + 1
        fbs = f'[{shown_fb} of {len(feedbacks)} shown]\n' + fbs
    return fbs

Prepare textual feedbacks for worksheet cell.

Arguments:

feedbacks: output from `prep_feedbacks` function

Returns:

feedbacks to fit one cell

Newline concatenated feedbacks, potentially truncated, to fit one cell of a report sheet.
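
An illustrative sketch of both branches (hypothetical bulk data for the truncating case):

    # Short lists pass through unchanged
    assert cell_feedbacks(['duidelijk', 'goede uitleg!']) == \
        'duidelijk\ngoede uitleg!'
    # An oversized list is cut at a newline below _CELL_MAX and prefixed
    # with a '[N of M shown]' header line
    cell = cell_feedbacks([f'feedback {i}' for i in range(3000)])
    assert cell.splitlines()[0].endswith('of 3000 shown]')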

#   def new_pages_sheet( wb: bd_www.report.ReportWorkbook, rep_ts: str, ref_ts: str, incl_txt_fb: bool = True ) -> None:
View Source
def new_pages_sheet(wb: ReportWorkbook,
                    rep_ts: str, ref_ts: str, incl_txt_fb: bool = True) -> None:
    """
    **Add a report sheet with data of the new pages.**

    Arguments:

        wb: workbook in which the new worksheet is created
        rep_ts: timestamp of the scrape that marks the (exclusive) end of the
            reporting period
        ref_ts: timestamp of the scrape that marks the (inclusive) start of the
            reporting period and the (exclusive) end of the reference period
        incl_txt_fb: if False, no textual feedback will be included

    This sheet lists all relevant available and calculated data for each page
    that was added since the reference scrape.
    """

    # Set (inclusive) dates for the reporting period
    rep_end = ts_to_d(rep_ts, -1)
    rep_start = ts_to_d(ref_ts) if ref_ts else rep_end

    # Focus on specific scrape data and create temporary views
    with Scrape(rep_ts):
        page_themes()
        per_metrics(rep_start, rep_end)
        per_feedback(rep_start, rep_end)

        qry = f'''
            WITH
                -- Period timestamps
                period (per_ts) AS (
                    SELECT timestamp
                    FROM mst_scrapes
                    WHERE timestamp > '{ref_ts}'
                        AND timestamp <= '{rep_ts}'
                ),
                -- Pages alive values at each period timestamp
                pages_alive_value (per_ts, page_id, alive) AS (
                    SELECT DISTINCT
                        per_ts, page_id,
                        last_value(alive) OVER (
                            PARTITION BY per_ts, page_id
                            ORDER BY timestamp
                            ROWS BETWEEN UNBOUNDED PRECEDING
                                AND UNBOUNDED FOLLOWING
                        )
                    FROM his_pages_life, period
                    WHERE timestamp <= per_ts
                ),
                -- Pages alive at each period timestamp
                living_pages (per_ts, page_id) AS (
                    SELECT per_ts, page_id
                    FROM pages_alive_value
                    WHERE alive
                ),
                -- Titles of living pages at each period timestamp
                pages_titles (per_ts, page_id, title) AS (
                    SELECT DISTINCT
                        per_ts, page_id,
                        last_value(title) OVER (
                            PARTITION BY per_ts, page_id
                            ORDER BY timestamp
                            ROWS BETWEEN UNBOUNDED PRECEDING
                                AND UNBOUNDED FOLLOWING
                        )
                    FROM living_pages
                    LEFT JOIN his_pages_info USING (page_id)
                    WHERE timestamp <= per_ts
                ),
                -- Title frequency at each period timestamp
                title_freq (per_ts, page_id, freq) AS (
                    SELECT
                        per_ts, page_id,
                        count(page_id) OVER (
                            PARTITION BY per_ts, title
                            ROWS BETWEEN UNBOUNDED PRECEDING
                                AND UNBOUNDED FOLLOWING
                        )
                    FROM pages_titles
                ),
                -- Pages added in the period with the timestamp of addition
                page_additions (page_id, add_ts) AS (
                    SELECT page_id, timestamp
                    FROM his_pages_life
                    WHERE timestamp IN period AND alive
                ),
                -- Info timestamp of the added pages
                added_pages_info_ts (page_id, add_ts, info_ts) AS (
                    SELECT DISTINCT
                        page_id, add_ts,
                        last_value(timestamp) OVER (
                            PARTITION BY page_id, add_ts
                            ORDER BY timestamp
                            ROWS BETWEEN UNBOUNDED PRECEDING
                                AND UNBOUNDED FOLLOWING
                        )
                    FROM page_additions
                    LEFT JOIN his_pages_info USING (page_id)
                    WHERE timestamp <= add_ts
                ),
                -- All aspects of the added pages, including title frequency
                added_pages_aspects AS (
                    SELECT add_ts, freq, i.*
                    FROM added_pages_info_ts AS a
                    LEFT JOIN his_pages_info AS i
                        ON a.page_id = i.page_id AND a.info_ts = i.timestamp
                    LEFT JOIN title_freq AS t
                        ON a.page_id = t.page_id AND a.add_ts = t.per_ts
                )
            SELECT
                add_ts, page_id, path, 
                title, freq, description,
                iif(description ISNULL, 0, length(description)) AS descr_len,
                first_h1, num_h1s, language, modified, pagetype,
                classes, theme, business, category,
                ifnull(referral_cnt, iif(unl_type = 'orphan', '0!', 0))
                    AS num_ed_refs,
                ed_text, aut_text,
                visits, views, entries, bounces, bounce_rate, exits, exit_rate,
                organic_entries, organic_entry_rate, call_visits, call_rate,
                neg_fb_cnt, pos_fb_cnt,
                ifnull(CAST (neg_fb_cnt + pos_fb_cnt AS REAL) / visits,0)
                    AS fb_rate,
                neg_fb_pct, pos_fb_pct, neg_txts, pos_txts
            FROM added_pages_aspects
            LEFT JOIN mst_paths USING (page_id)
            LEFT JOIN temp.themes USING (page_id)
            LEFT JOIN tsd_int_ref_cnt USING (page_id)
            LEFT JOIN tsd_unlinked USING (page_id)
            LEFT JOIN temp.per_metrics USING (page_id)
            LEFT JOIN temp.per_feedback USING (page_id)
            ORDER BY page_id, add_ts
            '''
        data = []
        # Calculate and insert extra data into the query results
        for cells in mst_conn.execute(qry).fetchall():
            cells = list(cells)
            # Reformat or delete textual feedback
            if incl_txt_fb:
                if cells[35]:
                    cells[35] = cell_feedbacks(prep_feedbacks(cells[35]))
                if cells[36]:
                    cells[36] = cell_feedbacks(prep_feedbacks(cells[36]))
            else:
                del cells[35:37]
            # Insert number of words in editorial content
            ed_text = cells[17]
            ed_wrd_cnt = len(re.findall(r'\w+', ed_text))
            cells.insert(18, ed_wrd_cnt)
            # Insert number of words in first h1
            first_h1 = cells[7]
            words_h1 = 0 if not first_h1 else len(re.findall(r'\w+', first_h1))
            cells.insert(8, words_h1)
            # Insert url name and path quality
            path = cells[2]
            name_q, path_q = url_quality(path)
            cells.insert(3, path_q / 100)
            cells.insert(3, name_q / 100)
            data.append(cells)
    wb.add_datasheet('New pages', data, incl_txt_fb)

Add a report sheet with data of the new pages.

Arguments:

wb: workbook in which the new worksheet is created
rep_ts: timestamp of the scrape that marks the (exclusive) end of the
    reporting period
ref_ts: timestamp of the scrape that marks the (inclusive) start of the
    reporting period and the (exclusive) end of the reference period
incl_txt_fb: if False, no textual feedback will be included

This sheet lists all relevant available and calculated data for each page that was added since the reference scrape.
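
The word counts inserted into the sheet rely on simple \w+ token matching; a small illustration with hypothetical texts:

    import re

    # Digits and accented letters each count as (part of) a word
    assert len(re.findall(r'\w+', 'Aangifte doen vóór 1 mei')) == 5
    assert len(re.findall(r'\w+', '')) == 0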

#   def url_quality(url: str) -> (int, int):
View Source
def url_quality(url: str) -> (int, int):
    r"""**Assess the quality of the path component of a url.**

    Arguments:

        url: complete url or path component to be assessed

    Returns:

        quality of the name, quality of ancestral path

    Both the name and the complete ancestral path of the url are assessed,
    returning two scores on a scale of 0 to 100.

    An optimal url path consists of lower case characters, numbers and
    hyphens only. A name or ancestral path complying with this will get a
    score of 100. Any deviation lowers this score by multiplying it with a
    factor that depends on the number and kind of deviations. Four groups of
    deviating characters are distinguished:

    - unsafe characters: blank space and " < > % { } | \ ^ `
    - reserved characters: / ? # ] [ @ ! $ & ' ( ) * + , : ; =
    - suboptimal characters: . _ ~
    - capitals

    Fractions used when encountering one or more of these characters are:

    - unsafe: one 0.50 / more 0.40
    - reserved: one 0.65 / more 0.50
    - suboptimal: one 0.70 / more 0.60
    - capitals: one 0.90 / more 0.80
    """

    def assess_q(part: str) -> int:
        """**Helper function to assess the quality of a part of a url path.**

        Arguments:

            part: url part to be assessed

        Returns:

            quality score on a 0-100 scale
        """
        one_unsafe = 0.50
        more_unsafe = 0.40
        one_reserved = 0.65
        more_reserved = 0.50
        one_subopt = 0.70
        more_subopt = 0.60
        one_caps = 0.90
        more_caps = 0.80
        # Character class; without the brackets the pattern would only match
        # the literal character sequence instead of the individual characters
        unsafe_chars = re.compile(r'[ "<>%{}|\\^`]')
        reserved_chars = re.compile(r'[:/?#\[\]@!$&\'()*+,;=]')
        subopt_chars = re.compile('[._~]')
        caps = re.compile('[A-Z]')

        q = 100
        unsafe_match = unsafe_chars.findall(part)
        if unsafe_match:
            q *= one_unsafe if len(unsafe_match) == 1 else more_unsafe
        reserved_match = reserved_chars.findall(part)
        if reserved_match:
            q *= one_reserved if len(reserved_match) == 1 else more_reserved
        subopt_match = subopt_chars.findall(part)
        if subopt_match:
            q *= one_subopt if len(subopt_match) == 1 else more_subopt
        caps_match = caps.findall(part)
        if caps_match:
            q *= one_caps if len(caps_match) == 1 else more_caps
        return int(q)

    path = urlparse(url).path
    ancestral_path, name = path.rsplit('/', 1)

    # Assess the quality of the final part of the url
    name_q = assess_q(name)

    # Assess the quality of the ancestral path (path without name)
    ancestors = ancestral_path.split('/')
    if ancestors[0] == '':
        ancestors.pop(0)
    if ancestors and ancestors[-1] == name:
        # Do not assess the ancestors for a double name at the end
        ancestors.pop()
    anc_string = ''.join(ancestors)
    anc_q = assess_q(anc_string)

    return name_q, anc_q

Assess the quality of the path component of a url.

Arguments:

url: complete url or path component to be assessed

Returns:

quality of the name, quality of ancestral path

Both the name and the complete ancestral path of the url are assessed, returning two scores on a scale of 0 to 100.

An optimal url path consists of lower case characters, numbers and hyphens only. A name or ancestral path complying with this will get a score of 100. Any deviation lowers this score by multiplying it with a factor that depends on the number and kind of deviations. Four groups of deviating characters are distinguished:

  • unsafe characters: blank space and " < > % { } | \ ^ `
  • reserved characters: / ? # ] [ @ ! $ & ' ( ) * + , : ; =
  • suboptimal characters: . _ ~
  • capitals

Fractions used when encountering one or more of these characters are:

  • unsafe: one 0.50 / more 0.40
  • reserved: one 0.65 / more 0.50
  • suboptimal: one 0.70 / more 0.60
  • capitals: one 0.90 / more 0.80
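
A worked example with a hypothetical path: the name 'Some_Page.html' contains two suboptimal characters ('_' and '.') and two capitals, while the ancestral path contains one capital only.

    name_q, anc_q = url_quality(
        '/bldcontentnl/belastingdienst/Prive/Some_Page.html')
    # name: 100 * 0.60 (two suboptimal) * 0.80 (two capitals) -> 48
    # ancestral path: 100 * 0.90 (one capital) -> 90
    assert (name_q, anc_q) == (48, 90)
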
#   def changed_aspects_sheet(wb: bd_www.report.ReportWorkbook, rep_ts: str, ref_ts: str) -> None:
View Source
def changed_aspects_sheet(wb: ReportWorkbook,
                          rep_ts: str, ref_ts: str) -> None:
    """
    **Add a report sheet with page changes.**

    Arguments:

        wb: workbook in which the new worksheet is created
        rep_ts: timestamp of the scrape that marks the (exclusive) end of the
            reporting period
        ref_ts: timestamp of the scrape that marks the (inclusive) start of the
            reporting period and the (exclusive) end of the reference period

    The changes are ordered per page and per aspect that changed within the
    reporting period. For each aspect change, the previous and the new value
    are listed, together with the timestamps of the scrapes that registered
    these values. Changes in textual aspects also list a modification factor,
    which is a measure of the difference between the two values. Refer to the
    documentation of the `mod_factor` function for further information
    about this factor.
    """

    qry = 'SELECT name FROM pragma_table_info("his_pages_info")'
    aspects = [row[0] for row in mst_conn.execute(qry).fetchall()
               if row[0] not in ('timestamp', 'page_id')]
    data = []
    for aspect in aspects:
        # Get values and modification factor of changes in aspect
        qry = f'''
            WITH
                -- Timestamps within the reporting period
                period AS (
                    SELECT timestamp
                    FROM mst_scrapes
                    WHERE timestamp > '{ref_ts}'
                        AND timestamp <= '{rep_ts}'
                ),
                aspect_changes AS (
                    SELECT
                        page_id, timestamp,
                        iif(lag({aspect}) OVER win = {aspect}, NULL, {aspect})
                            AS aspect
                    FROM his_pages_info
                    WHERE timestamp <= '{rep_ts}'
                    WINDOW win AS (PARTITION BY page_id ORDER BY timestamp)
                ),
                -- Changes in the relevant aspect before or in period
                aspect_hist AS (
                    SELECT *
                    FROM aspect_changes
                    WHERE aspect NOTNULL
                ),
                -- Aspect changes with previous values
                change_pairs AS (
                    SELECT
                        page_id,
                        lag(timestamp) OVER win AS old_ts,
                        lag(aspect) OVER win AS old_value,
                        timestamp AS new_ts,
                        aspect AS new_value
                    FROM aspect_hist
                    WINDOW win AS (PARTITION BY page_id ORDER BY timestamp)
                )
            SELECT page_id, path, business, language, pagetype,
                '{aspect}' AS aspect,
                new_value, new_ts,
                old_value, old_ts
            FROM change_pairs AS c
            LEFT JOIN mst_paths USING (page_id)
            LEFT JOIN his_pages_info AS i USING (page_id) 
            WHERE old_ts NOTNULL AND new_ts IN period
                AND c.new_ts = i.timestamp
            ORDER BY page_id, old_ts
            '''
        for cells in mst_conn.execute(qry).fetchall():
            cells = list(cells)
            # Calculate and insert modification factor
            if aspect in ('title', 'description', 'first_h1',
                          'ed_text', 'aut_text'):
                new_txt, old_txt = cells[6], cells[8]
                mf = mod_factor(old_txt, new_txt)
            else:
                mf = None
            cells.insert(8, mf)
            data.append(cells)

    # Sort changes on page_id, aspect, timestamp of new value
    data.sort(key=itemgetter(0, 5, 7))
    wb.add_datasheet('Changed aspects', data, shading='cluster')

Add a report sheet with page changes.

Arguments:

wb: workbook in which the new worksheet is created
rep_ts: timestamp of the scrape that marks the (exclusive) end of the
    reporting period
ref_ts: timestamp of the scrape that marks the (inclusive) start of the
    reporting period and the (exclusive) end of the reference period

The changes are ordered per page and per aspect that changed within the reporting period. For each aspect change, the previous and the new value are listed, together with the timestamps of the scrapes that registered these values. Changes in textual aspects also list a modification factor, which is a measure of the difference between the two values. Refer to the documentation of the mod_factor function for further information about this factor.

#   def mod_factor(ref_text: str, act_text: str) -> float:
View Source
def mod_factor(ref_text: str, act_text: str) -> float:
    """
    **Calculate the modification factor of a text string.**

    Arguments:

        ref_text: text acting as reference
        act_text: actual text to compare against the reference

    Returns:

        modification factor in the range of 0 to 1

    The returned value is a measure of the difference between two texts on a
    scale from 0 (texts are exactly equal) to 1 (texts are completely
    different). The value is calculated as `1 - (SR1 + SR2)/2`, where `SR`
    stands for the similarity ratio as defined in the Python standard
    `difflib` module. `SR1` represents the similarity of both texts. `SR2` is
    the similarity of the sorted set of words from both texts. Averaging
    these ratios has the effect that changes in both wording and phrasing are
    distinguished from changes in phrasing or wording only.
    """

    sm = difflib.SequenceMatcher(a=ref_text, b=act_text)
    lib_ratio = sm.ratio()
    sm.set_seq1(a=sorted(list(set(ref_text.split()))))
    sm.set_seq2(b=sorted(list(set(act_text.split()))))
    set_ratio = sm.ratio()
    return 1 - (lib_ratio + set_ratio) / 2

Calculate the modification factor of a text string.

Arguments:

ref_text: text acting as reference
act_text: actual text to compare against the reference

Returns:

modification factor in the range of 0 to 1

The returned value is a measure of the difference between two texts on a scale from 0 (texts are exactly equal) to 1 (texts are completely different). The value is calculated as 1 - (SR1 + SR2)/2, where SR stands for the similarity ratio as defined in the Python standard difflib module. SR1 represents the similarity of both texts. SR2 is the similarity of the sorted set of words from both texts. Averaging these ratios has the effect that changes in both wording and phrasing are distinguished from changes in phrasing or wording only.
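
Two illustrative calls with hypothetical texts: identical texts yield 0, and merely reordering words keeps the word-set ratio SR2 at 1.0, so the factor stays below 0.5.

    assert mod_factor('aangifte vóór 1 mei', 'aangifte vóór 1 mei') == 0
    reorder = mod_factor('eerst aangifte doen', 'doen eerst aangifte')
    assert 0 < reorder < 0.5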

#   def all_pages_sheet( wb: bd_www.report.ReportWorkbook, rep_ts: str, ref_ts: str, incl_txt_fb: bool = True ) -> None:
View Source
def all_pages_sheet(wb: ReportWorkbook,
                    rep_ts: str, ref_ts: str, incl_txt_fb: bool = True) -> None:
    """
    **Add a report sheet with data of all pages.**

    Arguments:

        wb: workbook in which the new worksheet is created
        rep_ts: timestamp of the scrape that marks the (exclusive) end of the
            reporting period
        ref_ts: timestamp of the scrape that marks the (inclusive) start of the
            reporting period and the (exclusive) end of the reference period
        incl_txt_fb: if False, no textual feedback will be included

    This sheet lists the relevant available and calculated data for all pages
    alive at the reporting timestamp `rep_ts`.
    """

    # Set (inclusive) dates for the reporting period
    rep_end = ts_to_d(rep_ts, -1)
    rep_start = ts_to_d(ref_ts) if ref_ts else rep_end

    # Focus on specific scrape data and create temporary views
    with Scrape(rep_ts):
        page_themes()
        per_metrics(rep_start, rep_end)
        per_feedback(rep_start, rep_end)

        qry = '''
            WITH
                scr_title_freq AS (
                    SELECT
                        title,
                        count(*) AS title_freq
                    FROM tsd_pages_info
                    GROUP BY title
                )
            SELECT
                page_id, path,
                title, title_freq, description,
                iif(description ISNULL, 0, length(description)) AS descr_len,
                first_h1,  num_h1s, language, modified, pagetype,
                classes, theme, business, category,
                ifnull(referral_cnt, iif(unl_type = 'orphan', '0!', 0))
                    AS num_ed_refs,
                ed_text, aut_text,
                visits, views, entries, bounces, bounce_rate, exits, exit_rate,
                organic_entries, organic_entry_rate, call_visits, call_rate,
                neg_fb_cnt, pos_fb_cnt,
                ifnull(CAST (neg_fb_cnt + pos_fb_cnt AS REAL) / visits,0)
                    AS fb_rate,
                neg_fb_pct,  pos_fb_pct, neg_txts, pos_txts
            FROM tsd_pages_info
            LEFT JOIN mst_paths USING (page_id)
            LEFT JOIN scr_title_freq USING (title)
            LEFT JOIN temp.themes USING (page_id)
            LEFT JOIN tsd_int_ref_cnt USING (page_id)
            LEFT JOIN tsd_unlinked USING (page_id)
            LEFT JOIN temp.per_metrics USING (page_id)
            LEFT JOIN temp.per_feedback USING (page_id)
            ORDER BY visits DESC
            '''
        data = []
        # Calculate and insert extra data into the query results
        for cells in mst_conn.execute(qry).fetchall():
            cells = list(cells)
            # Reformat or delete textual feedback
            if incl_txt_fb:
                if cells[34]:
                    cells[34] = cell_feedbacks(prep_feedbacks(cells[34]))
                if cells[35]:
                    cells[35] = cell_feedbacks(prep_feedbacks(cells[35]))
            else:
                del cells[34:36]
            # Insert number of words in editorial content
            ed_text = cells[16]
            ed_wrd_cnt = len(re.findall(r'\w+', ed_text))
            cells.insert(17, ed_wrd_cnt)
            # Insert number of words in first h1
            first_h1 = cells[6]
            words_h1 = 0 if not first_h1 else len(re.findall(r'\w+', first_h1))
            cells.insert(7, words_h1)
            # Insert url name and path quality
            path = cells[1]
            name_q, path_q = url_quality(path)
            cells.insert(2, path_q / 100)
            cells.insert(2, name_q / 100)
            data.append(cells)
    wb.add_datasheet('All pages', data, incl_txt_fb)

Add a report sheet with data of all pages.

Arguments:

wb: workbook in which the new worksheet is created
rep_ts: timestamp of the scrape that marks the (exclusive) end of the
    reporting period
ref_ts: timestamp of the scrape that marks the (inclusive) start of the
    reporting period and the (exclusive) end of the reference period
incl_txt_fb: if False, no textual feedback will be included

This sheet lists the relevant available and calculated data for all pages alive at the reporting timestamp rep_ts.

#   def all_feedback_sheet(wb: bd_www.report.ReportWorkbook, rep_ts: str, ref_ts: str) -> None:
View Source
def all_feedback_sheet(wb: ReportWorkbook, rep_ts: str, ref_ts: str) -> None:
    """
    **Add a report sheet with textual feedback of all pages.**

    Arguments:

        wb: workbook in which the new worksheet is created
        rep_ts: timestamp of the scrape that marks the (exclusive) end of the
            reporting period
        ref_ts: timestamp of the scrape that marks the (inclusive) start of the
            reporting period and the (exclusive) end of the reference period

    This sheet lists available textual feedback for pages alive at the
    reporting timestamp `rep_ts`.
    """

    # Set (inclusive) dates for the reporting period
    rep_end = ts_to_d(rep_ts, -1)
    rep_start = ts_to_d(ref_ts) if ref_ts else rep_end

    # Focus on specific scrape data and create temporary views
    with Scrape(rep_ts):
        per_feedback(rep_start, rep_end)
        qry = '''
            SELECT
                page_id, path, title, neg_txts, pos_txts
            FROM tsd_pages_info
            LEFT JOIN mst_paths USING (page_id)
            LEFT JOIN temp.per_feedback USING (page_id)
            WHERE neg_txts NOTNULL OR pos_txts NOTNULL
            ORDER BY fb_cnt DESC'''
        data = []
        # Form separate rows for each distinct page/feedback combination
        for page_id, path, title, neg_txts, pos_txts in \
                mst_conn.execute(qry).fetchall():
            if neg_txts:
                for txt in prep_feedbacks(neg_txts):
                    data.append((page_id, path, title, 'neg',
                                 int(txt) if txt.isdigit() else txt))
            if pos_txts:
                for txt in prep_feedbacks(pos_txts):
                    data.append((page_id, path, title, 'pos',
                                 int(txt) if txt.isdigit() else txt))
    wb.add_datasheet('All feedback', data, shading='cluster', cluster_col=3)

Add a report sheet with textual feedback of all pages.

Arguments:

wb: workbook in which the new worksheet is created
rep_ts: timestamp of the scrape that marks the (exclusive) end of the
    reporting period
ref_ts: timestamp of the scrape that marks the (inclusive) start of the
    reporting period and the (exclusive) end of the reference period

This sheet lists available textual feedback for pages alive at the reporting timestamp rep_ts.

#   def redirs_aliases_sheet(wb: bd_www.report.ReportWorkbook, rep_ts: str) -> None:
View Source
def redirs_aliases_sheet(
        wb: ReportWorkbook, rep_ts: str) -> None:
    """
    **Add a report sheet with redirects and aliases.**

    Arguments:

        wb: workbook in which the new worksheet is created
        rep_ts: timestamp of the scrape that marks the (exclusive) end of the
            reporting period

    This sheet lists all redirects and alias urls, together with their final
    wcm paths, that were encountered during the scrape with timestamp `rep_ts`.
    """

    with Scrape(rep_ts):
        qry = '''
            SELECT
                redir_type, req_id,
                CASE
                    WHEN req_id IS NULL
                    THEN req_url
                    ELSE req.path
                END AS req,
                redir_id,
                CASE
                    WHEN redir_id IS NULL
                    THEN redir_url
                    ELSE redir.path
                END AS redir
            FROM tsd_redirs AS r
            LEFT JOIN mst_paths AS req ON r.req_id = req.page_id
            LEFT JOIN mst_paths AS redir ON r.redir_id = redir.page_id
            ORDER BY req
            '''
        data = mst_conn.execute(qry).fetchall()
        # Convert redir type to int if appropriate
        data = [(int(t[0]) if t[0].isdigit() else t[0], *t[1:]) for t in data]
    wb.add_datasheet('Redirects and aliases', data)

Add a report sheet with redirects and aliases.

Arguments:

wb: workbook in which the new worksheet is created
rep_ts: timestamp of the scrape that marks the (exclusive) end of the
    reporting period

This sheet lists all redirects and alias urls, together with their final wcm paths, that were encountered during the scrape with timestamp rep_ts.
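
The type conversion only affects numeric redirect types; other entries (shown here with a hypothetical 'alias' marker value) keep their textual type:

    rows = [('301', 12, '/oud.html', 34, '/nieuw.html'),
            ('alias', None, '/alias-url', 56, '/pad.html')]
    rows = [(int(t[0]) if t[0].isdigit() else t[0], *t[1:]) for t in rows]
    assert rows[0][0] == 301 and rows[1][0] == 'alias'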

#   def downloads_sheet(wb: bd_www.report.ReportWorkbook, rep_ts: str, ref_ts: str) -> None:
View Source
def downloads_sheet(wb: ReportWorkbook, rep_ts: str, ref_ts: str) -> None:
    """
    **Add a report sheet with all downloads details.**

    Arguments:

        wb: workbook in which the new worksheet is created
        rep_ts: timestamp of the scrape that marks the (exclusive) end of the
            reporting period
        ref_ts: timestamp of the scrape that marks the (inclusive) start of the
            reporting period and the (exclusive) end of the reference period

    The sheet contains all downloads that were used during the period,
    along with the download details, metrics and referring pages. Codes
    embedded in the filenames of the following classes of downloads are
    extracted and exposed in the various fields of the sheet.

    - *Documents* - Descriptions are taken from the text anchors of the download
      links. Further details stem from the codes that are part of the filename,
      which are standardised by and used in the GDB document management system,
      that acts as repository for these files. The actual document codes deviate
      in various ways from the standard. The decoding algorithm catches as much
      non-standard codes as possible. Downloads that fail this approach are
      classified as documents with 'malformed code'.
    - *Payroll tax tables* - The algorithm that is used to deduce the
      description from the filename has been reverse engineered from existing
      downloads. Further details are not available.
    - *Rulings* - Descriptions are taken from the text anchors of the download
      links. The coding of the filenames is broken down into some of the
      fields, but provides no more information than the descriptions.
    """

    # Create and fill temporary dl_links table
    mst_conn.execute('DROP TABLE IF EXISTS dl_links')
    mst_conn.execute('''
        CREATE TEMPORARY TABLE dl_links (
            link_id	    INTEGER,
            filename	TEXT NOT NULL,
            dl_grp	    TEXT NOT NULL,
            dl_type		TEXT,
            descr		TEXT,
            doc_code	TEXT,
            subgroup	TEXT,
            seq_nr		TEXT,
            funct		INTEGER,
            add_typ		TEXT,
            yr_ver		TEXT,
            form		TEXT,
            ctry_lan	TEXT,
            FOREIGN KEY(link_id) REFERENCES mst_links,
            PRIMARY KEY(link_id)
        )
        ''')
    # Pattern for the well-formed doc code (allowing some deviations)
    patt_wf_doc_code = \
        r'(?P<subgroup>[a-z]{1,5})' \
        r'(?P<seq_nr>\d[a-z\d]{1,3}?[a-z\d]?)' \
        r'(?P<e_doc>[e]?)' \
        r'[-_]?' \
        r'(?P<funct>\d)' \
        r'(?P<add_typ>[beiotz])?' \
        r'(?P<yr_ver>[-\*\d]?\d{1,2})' \
        r'-?(?P<form>ed|fd|fol|hm|ke|odt|pl)' \
        r'-?' \
        r'(?P<ctry_lan>[a-z]{1,})?' \
        r'(?=\.)'
    # Pattern to search a doc code that is malformed or incorrect
    patt_mf_doc_code = \
        r'[-_]' \
        r'(?P<doc_code>' \
        r'(?P<subgroup>[a-z]{1,3})' \
        r'[-_]?' \
        r'\d{1,3}' \
        r'[a-z\d_-]*' \
        r'[a-z]{2}' \
        r')' \
        r'\.'
    # Pattern for payroll tax tables
    patt_pr_table = \
        r'(?P<colour>wit|groen)_' \
        r'(?P<table>kw|mnd|4?wk|dag|bb)_' \
        r'(?P<country>nl|be|sa|lk|dl)?_?' \
        r'(?P<sit_conv>std|ag|bh|ci|dj|ek|fl)_?' \
        r'(?P<liability>alg|svb)?_?' \
        r'(?P<vouchers>vb19min|vb19meer)?_' \
        r'[01]{0,4}' \
        r'(?P<year>\d{4})' \
        r'[01]{0,4}\.' \
        r'(?P<format>pdf|xlsx)'
    # Pattern for rulings documents
    patt_ruling = \
        r'rul-' \
        r'(?P<date>(?P<year>\d{4})(?P<month>\d{2})(?P<day>\d{2}))-' \
        r'(?P<subgroup>.*)-' \
        r'(?P<seq_nr>\d*).pdf'
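    # Example (illustrative): a filename like 'rul-20230115-apa-001.pdf'
    # matches patt_ruling with subgroup 'apa' and seq_nr '001'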
    # Select download links
    sel_qry = f'''
        SELECT link_id, substr(url, {len(DL_SITE)+1}) AS dl_path
        FROM mst_links
        WHERE url LIKE '{DL_SITE}/%'
        '''
    values = []
    for link_id, dl_path in mst_conn.execute(sel_qry).fetchall():
        filename = Path(dl_path).name.lower()
        dl_grp = Path(dl_path).parts[1]
        descr = doc_code = subgroup = seq_nr = funct = None
        add_typ = yr_ver = form = ctry_lan = None
        if dl_match := re.match(patt_pr_table, filename):
            dl_type = 'Payroll tax table'
            descr = ''
            # <colour>wit|groen
            match dl_match['colour']:
                case 'wit': descr += 'Witte'
                case 'groen': descr += 'Groene'
            # <table>kw|mnd|4?wk|dag|bb
            txt = ''
            match dl_match['table']:
                case 'kw': txt = ' Kwartaal<lp>'
                case 'mnd': txt = ' Maand<lp>'
                case '4wk': txt = ' Vierweken<lp>'
                case 'wk': txt = ' Week<lp>'
                case 'dag': txt = ' Dag<lp>'
                case 'bb': txt = ' Jaar<bb>'
            txt = txt.replace(
                '<lp>', 'loon-tabel loonbelasting/premie volksverzekeringen')
            txt = txt.replace(
                '<bb>', 'loon-tabel bijzondere beloningen')
            descr += txt
            # <vouchers>vb19min|vb19meer
            if dl_match['vouchers']:
                txt = ' met vakantiebonnen voor <nb> vakantiedagen per jaar'
            match dl_match['vouchers']:
                case 'vb19min': descr += txt.replace('<nb>', '19 of minder')
                case 'vb19meer': descr += txt.replace('<nb>', '20 of meer')
            # <country>nl|be|sa|lk|dl
            match dl_match['country']:
                case 'nl': descr += ' Nederland'
                case 'be': descr += ' Belgiƫ'
                case 'sa': descr += ' Suriname of Aruba'
                case 'lk': descr += ' Land van de landenkring'
                case 'dl': descr += ' Derde land'
            # <sit_conv>std|ag|bh|ci|dj|ek|fl
            match dl_match['sit_conv']:
                case 'std': descr += \
                    ', Standaard'
                case 'ag': descr += \
                    ', A - Uitsluitend premieplichtig AOW, ANW en WL'
                case 'bh': descr += \
                    ', B - Uitsluitend belastingplichtig'
                case 'ci': descr += \
                    ', C - Belastingplichtig en premieplichtig AOW en AN'
                case 'dj': descr += \
                    ', D - Belastingplichtig en premieplichtig WL'
                case 'ek': descr += \
                    ', E - Uitsluitend premieplichtig AOW en ANW'
                case 'fl': descr += \
                    ', F - Uitsluitend premieplichtig WL'
            # <liability>alg|svb
            match dl_match['liability']:
                case 'alg': pass
                case 'svb': descr += ', SVB'
            # <year>\d{4}
            if year := dl_match['year']:
                descr += f' (uitgave januari {year})'
        elif dl_match := re.search(patt_wf_doc_code, filename):
            dl_type = 'Document'
            doc_code = dl_match[0]
            subgroup = dl_match['subgroup']
            seq_nr = dl_match['seq_nr']
            funct = dl_match['funct']
            add_typ = dl_match['add_typ']
            yr_ver = dl_match['yr_ver']
            form = dl_match['form']
            ctry_lan = dl_match['ctry_lan']
        elif dl_match := re.search(patt_mf_doc_code, filename):
            dl_type = 'Document (malformed code)'
            doc_code = dl_match['doc_code']
            subgroup = dl_match['subgroup']
        elif dl_match := re.match(patt_ruling, filename):
            dl_type = 'Ruling'
            subgroup = dl_match['subgroup']
            seq_nr = dl_match['seq_nr']
        else:
            dl_type = 'Non-standard download file'

        vals = (link_id, filename, dl_grp, dl_type, descr, doc_code,
                subgroup, seq_nr, funct, add_typ, yr_ver, form, ctry_lan)
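        # empty or None values become SQL NULL, all others quoted literals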
        values.append(
            '(' + ', '.join([f'"{v}"' if v else 'NULL' for v in vals]) + ')')
    # Insert the gathered data with one big insert
    ins_qry = f'''
        INSERT INTO dl_links
            (link_id, filename, dl_grp, dl_type, descr, doc_code,
            subgroup, seq_nr, funct, add_typ, yr_ver, form, ctry_lan)
        VALUES
            {', '.join(values)}
        '''
    mst_conn.execute(ins_qry)

    # Set (inclusive) dates for the reporting period
    rep_end = ts_to_d(rep_ts, -1)
    rep_start = ts_to_d(ref_ts) if ref_ts else rep_end

    # Select and gather data to be reported
    qry = f'''
        WITH
            -- Dates in period
            per_days AS (
                SELECT date_id
                FROM dates
                WHERE date >= '{rep_start}' AND date <= '{rep_end}'
            ),
            -- Download metrics in period
            per_downloads AS (
                SELECT DISTINCT
                    link_id,
                    sum(nb_visits) AS uniq_downloads,
                    sum(nb_hits) AS downloads
                FROM downloads
                WHERE date_id IN per_days
                GROUP BY link_id
            ),
            -- Last presence of editorial download links at start of period
            last_presence_dl_edlinks AS (
                SELECT DISTINCT
                    page_id, link_id,
                    last_value(present) OVER win AS present
                FROM his_ed_links
                WHERE link_id IN (SELECT link_id FROM per_downloads)
                    AND timestamp <= '{ref_ts}'
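                -- full-partition frame, so last_value() returns the value
                -- at the latest timestamp per (page_id, link_id)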
                WINDOW win AS (
                    PARTITION BY page_id, link_id
                    ORDER BY timestamp
                    ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING
                )
            ),
            -- Editorial download links in period
            per_dl_edlinks AS (
                -- Editorial download links at start of period
                SELECT DISTINCT page_id, link_id
                FROM last_presence_dl_edlinks
                WHERE present > 0
                    UNION
                -- Editorial download links that became present during period
                SELECT DISTINCT page_id, link_id
                FROM his_ed_links
                WHERE link_id IN (SELECT link_id FROM per_downloads)
                    AND timestamp > '{ref_ts}' AND timestamp <= '{rep_ts}'
                    AND present > 0
            ),
            -- Downloads with all separate pages referring to them
            per_dl_refs_sep AS (
                SELECT DISTINCT link_id, page_id
                FROM per_downloads
                LEFT JOIN per_dl_edlinks USING (link_id)
            ),
            -- Downloads with combined pages referring to them
            per_dl_refs_comb AS (
                SELECT 
                    link_id,
                    group_concat(page_id, ', ') AS ref_page_ids
                FROM per_dl_refs_sep
                GROUP BY link_id
            ),
            -- Last editorial download links before or at end of period
            -- (including links that are no longer present)
            last_dl_edlinks AS (
                SELECT DISTINCT
                    page_id, link_id,
                    last_value(text) OVER win AS text
                FROM his_ed_links
                WHERE link_id IN (SELECT link_id FROM per_downloads)
                    AND timestamp <= '{rep_ts}'
                WINDOW win AS (
                    PARTITION BY page_id, text, link_id
                    ORDER BY timestamp
                    ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING
                )
            ),
            -- Timestamps of last pages info before or at end of period
            last_info_ts (page_id, timestamp) AS (
                SELECT DISTINCT page_id, last_value(timestamp) OVER win
                FROM his_pages_info
                WHERE timestamp <= '{rep_ts}'
                WINDOW win AS (
                    PARTITION BY page_id
                    ORDER BY timestamp
                    ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING
                )
            ),
            -- Last pages info before or at end of period
            last_pages_info AS (
                SELECT *
                FROM mst_paths
                LEFT JOIN last_info_ts USING (page_id)
                LEFT JOIN his_pages_info USING (page_id, timestamp)
            ),
            -- Last editorial download link details before or at end of period
            last_dl_edlink_details AS (
                SELECT
                    link_id,
                    -- truncate text before first newline
                    iif(instr(text, char(10)),
                        substr(text, 1, instr(text, char(10))-1),
                        text) AS text,
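                    -- keep only the part of the url after the last '/'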
                    replace(url, rtrim(url, replace(url, '/', '')), '') AS file,
                    language
                FROM last_dl_edlinks
                LEFT JOIN mst_links USING (link_id)
                LEFT JOIN last_pages_info USING (page_id)
            )
        SELECT DISTINCT
            link_id, filename, dl_grp, dl_type,
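            -- file extension: the part of the filename after the last '.'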
            replace(filename, rtrim(filename, replace(filename, '.', '')), '')
                AS file_type,
            ifnull(descr, first_value(text) OVER win) AS descr,
            doc_code, subgroup, seq_nr, funct, add_typ, yr_ver, form, ctry_lan,
            uniq_downloads, downloads, ref_page_ids
        FROM per_downloads
        LEFT JOIN last_dl_edlink_details USING (link_id)
        LEFT JOIN dl_links USING (link_id)
        LEFT JOIN per_dl_refs_comb USING (link_id)
        WINDOW win AS (
            PARTITION BY link_id
            -- preference for longest dutch text
            ORDER BY language DESC, length(text) DESC
            ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING
        )
        ORDER BY downloads DESC
        '''
    data = mst_conn.execute(qry).fetchall()
    wb.add_datasheet('Downloads', data)

Add a report sheet with all download details.

Arguments:

wb: workbook in which the new worksheet is created
rep_ts: timestamp of the scrape that marks the (exclusive) end of the
    reporting period
ref_ts: timestamp of the scrape that marks the (inclusive) start of the
    reporting period and the (exclusive) end of the reference period

The sheet contains all downloads that were used during the period, along with their details, metrics and referring pages. Codes in the names of the following classes of downloads are extracted and given in the various fields of the sheet.

  • Documents - Descriptions are taken from the anchor texts of the download links. Further details stem from the codes that are part of the filename, which are standardised and used by the GDB document management system that acts as the repository for these files. The actual document codes deviate in various ways from this standard. The decoding algorithm catches as many non-standard codes as possible. Downloads for which this approach fails are classified as documents with a 'malformed code'.
  • Payroll tax tables - The algorithm that is used to deduce the description from the filename has been reverse-engineered from existing downloads; a simplified sketch of this decoding follows after this list. Further details are not available.
  • Rulings - Descriptions are taken from the anchor texts of the download links. The coding of the filenames is broken down into some of the fields, but does not provide more information than the descriptions.
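
As an illustration of the decoding of payroll tax table filenames, the sketch below shows how a named-group pattern in the style of patt_pr_table can drive the build-up of the description. The pattern and the example filename are simplified assumptions; the actual pattern in the source covers more groups and variants.

    import re

    # Hypothetical, simplified stand-in for patt_pr_table; the real pattern
    # also has groups for vouchers, sit_conv, liability and more variants.
    patt_pr_table = re.compile(
        r'(?P<colour>wit|groen)_(?P<table>kw|mnd|4wk|wk|dag|bb)'
        r'_(?P<country>nl|be|sa|lk|dl)_(?P<year>\d{4})\.pdf')

    if dl_match := patt_pr_table.match('wit_mnd_nl_2024.pdf'):
        descr = 'Witte' if dl_match['colour'] == 'wit' else 'Groene'
        match dl_match['table']:
            case 'kw': descr += ' Kwartaalloon-tabel'
            case 'mnd': descr += ' Maandloon-tabel'
            case _: descr += ' loon-tabel'
        if dl_match['country'] == 'nl':
            descr += ' Nederland'
        descr += f" (uitgave januari {dl_match['year']})"
        print(descr)  # Witte Maandloon-tabel Nederland (uitgave januari 2024)
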
def paths_sheet(wb: bd_www.report.ReportWorkbook, rep_ts: str, ref_ts: str) -> None:
View Source
def paths_sheet(wb: ReportWorkbook, rep_ts: str, ref_ts: str) -> None:
    """
    **Add a report sheet with all page paths.**

    Arguments:

        wb: workbook in which the new worksheet is created
        rep_ts: timestamp of the scrape that marks the (exclusive) end of the
            reporting period
        ref_ts: timestamp of the scrape that marks the (inclusive) start of the
            reporting period and the (exclusive) end of the reference period

    This sheet lists the paths of all pages that were alive (at least at some
    moment) in the reporting period.
    """

    # Set (inclusive) dates for the reporting period
    rep_end = ts_to_d(rep_ts, -1)
    rep_start = ts_to_d(ref_ts) if ref_ts else rep_end

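    # Build temp.per_pages with all pages alive at some moment in the period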
    per_pages(rep_start, rep_end)
    qry = f'''
        SELECT page_id, path
        FROM temp.per_pages
        LEFT JOIN mst_paths USING (page_id)
        ORDER BY page_id'''
    data = mst_conn.execute(qry).fetchall()
    wb.add_datasheet('Paths', data)

Add a report sheet with all page paths.

Arguments:

wb: workbook in which the new worksheet is created
rep_ts: timestamp of the scrape that marks the (exclusive) end of the
    reporting period
ref_ts: timestamp of the scrape that marks the (inclusive) start of the
    reporting period and the (exclusive) end of the reference period

This sheet lists the paths of all pages that were alive (at least at some moment) in the reporting period.
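
Both sheet functions derive the inclusive period boundaries with the ts_to_d helper, which falls outside this excerpt. The sketch below gives a minimal, presumed implementation; the yymmdd timestamp prefix and the signature are assumptions, not the actual code.

    import datetime as dt

    def ts_to_d(ts: str, offset: int = 0) -> dt.date:
        # Hypothetical sketch: parse the assumed leading yymmdd date part of
        # a scrape timestamp and shift it by 'offset' days, so that
        # ts_to_d(rep_ts, -1) yields the (inclusive) end of the period.
        return (dt.datetime.strptime(ts[:6], '%y%m%d')
                + dt.timedelta(days=offset)).date()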