revisit_scrapes

"""
**Module for processing scrapes afterwards.**

The module `scrape_and_report` produces a new scrape, processes the scraped
data, gets the latest web analytics and produces the relevant reports for
the new scrape. Executing this `scrape_and_report` module regularly (e.g.
by scheduling it on a daily basis) is sufficient in a stable situation.

Still, conditions can arise that require re-executing some of these steps
for existing scrapes. This module provides for such situations by means of
the `revisit_earlier_scrapes` function. It is intended to be run manually,
with the input parameters in the `__main__` section of this module
controlling which scrapes are revisited and which steps are repeated.
"""

import logging
import sys

from bd_www import mst_conn
from bd_www.scrape import reprocess_scrapes
from bd_www.matomo import period_metrics, period_feedback, period_downloads
from bd_www.report import site_report, ts_to_d

# Logging has not been set up elsewhere, so configure a console logger here
logger = logging.getLogger('revisit')
logger.setLevel(logging.DEBUG)
ch = logging.StreamHandler()
ch.setLevel(logging.DEBUG)
formatter = logging.Formatter(
    fmt='[%(asctime)s] %(levelname)-8s - %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S')
ch.setFormatter(formatter)
logger.addHandler(ch)


def revisit_earlier_scrapes(
        first_ts: str = '000101-0000',
        last_ts: str = '991231-2359',
        revisit_sources: bool = False,
        reanalyse_edlinks: bool = False,
        renew_analytics: bool = False,
        reproduce_reports: bool = True) -> None:
    """
    **Reproduce data and reports from earlier scrapes.**

    Arguments:

        first_ts: timestamp of first scrape [yymmdd-hhmm]
        last_ts: timestamp of last scrape [yymmdd-hhmm]
        revisit_sources: reanalyse page sources
        reanalyse_edlinks: reanalyse editorial links
        renew_analytics: request new web analytics (overwriting existing data)
        reproduce_reports: produce new reports (overwriting old ones)

    This function is used to update various data related to earlier scrapes.

    When `revisit_sources` is `True`, the `bd_www.scrape.reprocess_scrapes`
    function will be used to renew data in the history tables of the scrapes
    database. It should be applied with care and only in the following
    situations:

    - the *non_scrape_ids* and/or *non_orphan_ids* table of the scrapes
      database was updated (set `first_ts` to a recent scrape)
    - code was changed to extract and/or derive info from page sources
    - code was changed to calculate scrape status figures
    - code was changed to extract and process editorial links
      (set `reanalyse_edlinks` to `True`)

    Since the history tables use time-delta record storage, renewing data
    will always erase all data with later timestamps.
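
    For example, to renew the web analytics and reproduce the reports for
    all scrapes of January 2023 (a sketch; the timestamps are illustrative):

        revisit_earlier_scrapes(
            first_ts='230101-0000',
            last_ts='230131-2359',
            renew_analytics=True)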
    """

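    # Collect the timestamps of all executed scrapes in chronological order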
    qry = '''
        SELECT timestamp
        FROM mst_scrapes
        ORDER BY timestamp
        '''
    all_tss = [row[0] for row in mst_conn.execute(qry).fetchall()]
    # Prime the list of timestamps to revisit with the one preceding the first
    prev_tss = [ts for ts in all_tss if ts < first_ts]
    rev_tss = [prev_tss[-1] if prev_tss else None]
    # Add timestamps of scrapes to revisit
    rev_tss += [ts for ts in all_tss if first_ts <= ts <= last_ts]

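    # Reprocessing starts at the first scrape within the window; since the
    # history tables store time-deltas, this also renews the data of all
    # later scrapes.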
    if revisit_sources:
        logger.info(f'started reprocessing {len(rev_tss) - 1} scrapes; '
                    f'this will take some time')
        reprocess_scrapes(rev_tss[1], reanalyse_edlinks, copy_db=False)

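    # Walk over consecutive (previous, current) timestamp pairs; the primer
    # element makes the previous timestamp available for the first scrape in
    # the window.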
    for i in range(len(rev_tss) - 1):
        prev_ts, curr_ts = rev_tss[i:i + 2]
        day_previous_scrape = ts_to_d(prev_ts) if prev_ts else None
        day_before_current_scrape = ts_to_d(curr_ts, -1)
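        # The web analytics period for a scrape runs from the day of the
        # previous scrape up to the day before the current scrape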
        logger.info(f'processing scrape {curr_ts}')
        if renew_analytics:
            period_metrics(
                first_date=day_previous_scrape,
                last_date=day_before_current_scrape,
                replace=True)
            period_feedback(
                first_date=day_previous_scrape,
                last_date=day_before_current_scrape)
            period_downloads(
                first_date=day_previous_scrape,
                last_date=day_before_current_scrape)
        if reproduce_reports:
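            # Overwrites any report produced earlier for this scrape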
            site_report(curr_ts, log_to_file=False)


if __name__ == '__main__':

    # ======================== INPUT PARAMETERS ========================== #
    first_scrape = '230116-0500'
    last_scrape = '230117-0500'
    renew_pages_info = False
    reanalyse_editorial_links = False
    renew_usage_statistics = True
    reproduce_site_reports = True
    # ==================================================================== #

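    # Reprocessing page sources also renews the data of all later scrapes;
    # ask for confirmation when the requested window does not extend to the
    # most recent scrape.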
    last_executed_scrape = mst_conn.execute(
        'SELECT max(timestamp) FROM mst_scrapes').fetchone()[0]
    if last_scrape < last_executed_scrape and renew_pages_info:
        print(f'Last timestamp {last_scrape} precedes the last '
              f'executed scrape {last_executed_scrape}')
        question = 'Do you still want to continue [yes/no]? '
        while (answer := input(question)) not in ('yes', 'no'):
            pass
        if answer == 'no':
            sys.exit()

    revisit_earlier_scrapes(first_scrape, last_scrape, renew_pages_info,
                            reanalyse_editorial_links, renew_usage_statistics,
                            reproduce_site_reports)