# Source code for mal_scraper.anime

import itertools
import logging
from datetime import datetime

from bs4 import BeautifulSoup

from .consts import AgeRating, AiringStatus, Format, Retrieved, Season
from .exceptions import MissingTagError, ParseError
from .mal_utils import get_date
from .requester import request_passthrough
from .user_discovery import default_user_store

logger = logging.getLogger(__name__)


def get_anime(id_ref=1, requester=request_passthrough):
    """Return the information for a particular show.

    You can simply enumerate through id_refs.

    This will raise exceptions unless we properly and fully retrieve and process
    the web-page.

    TODO: Genres https://myanimelist.net/info.php?go=genre
    # Broadcast? Producers? Licensors? Studios? Source? Duration?


    Args:
        id_ref (int, optional): Internal show identifier.
        requester (requests-like, optional): HTTP request maker.
            This allows us to control/limit/mock requests.

    Returns:
        :class:`.Retrieved`: with the attributes `meta` and `data`.

        `meta`::

            {
                'when': datetime (naive, UTC) taken after the page was parsed,
                'id_ref': int, the identifier that was requested,
                'response': the response object returned by the requester,
            }

        `data`::

            {
                'name': str,
                'name_english': str,
                'format': mal_scraper.Format,
                'episodes': int, or None when MAL does not know,
                'airing_status': mal_scraper.AiringStatus,
                'airing_started': date, or None when MAL does not know,
                'airing_finished': date, or None when MAL does not know,
                'airing_premiere': tuple(Year (int), Season (mal_scraper.Season))
                    or None (for films, OVAs, specials, ONAs, music, or
                    if MAL does not know),
                'mal_age_rating': mal_scraper.AgeRating,
                'mal_score': float, or None when not yet aired/MAL does not know,
                'mal_scored_by': int (number of people),
                'mal_rank': int, or None when not yet aired/some R rated anime,
                'mal_popularity': int,
                'mal_members': int,
                'mal_favourites': int,
            }

        See also :class:`.Format`, :class:`.AiringStatus`, :class:`.Season`.

    Raises:
        Network and Request Errors: See Requests library.
        .ParseError: Upon processing the web-page including anything that does
            not meet expectations.

    Examples:

        Retrieve the first anime and get the next anime to retrieve::

            next_anime = 1

            try:
                meta, data = mal_scraper.get_anime(next_anime)
            except mal_scraper.ParseError as err:
                logger.error('Investigate page %s with error %d', err.url, err.code)
            except NetworkandRequestErrors:  # Pseudo-code (TODO: These docs)
                pass  # Retry?
            else:
                mycode.save_data(data, when=meta['when'])

            # `meta` is only bound when the call succeeded, so advance the
            # counter itself rather than reading meta['id_ref'].
            next_anime += 1
    """
    url = get_url_from_id_ref(id_ref)
    logger.debug('Retrieving anime "%s" from "%s"', id_ref, url)

    response = requester.get(url)
    response.raise_for_status()  # May raise
    # TODO: Raise RequestError if 404

    # Dynamic user discovery: the raw page text is handed to the user store
    # before any anime-specific parsing happens.
    default_user_store.store_users_from_html(response.text)

    soup = BeautifulSoup(response.content, 'html.parser')
    data = get_anime_from_soup(soup)  # May raise

    meta = {
        'when': datetime.utcnow(),
        'id_ref': id_ref,
        'response': response,
    }

    return Retrieved(meta, data)


def get_url_from_id_ref(id_ref):
    """Build the myanimelist.net anime page URL for ``id_ref``.

    Args:
        id_ref (int): Internal show identifier; it is formatted with ``{:d}``.

    Returns:
        str: ``<scheme>://myanimelist.net/anime/<id_ref>``.
    """
    # The flag is looked up on every call rather than at import time.
    from .__init__ import _FORCE_HTTP

    # HTTPS is the default because it avoids the auto-redirect from HTTP;
    # plain HTTP is only selected when the package-level flag is set (tests).
    if _FORCE_HTTP:
        scheme = 'http'
    else:
        scheme = 'https'

    return '{}://myanimelist.net/anime/{:d}'.format(scheme, id_ref)


def get_anime_from_soup(soup):
    """Extract every anime field from a parsed MAL anime page.

    Args:
        soup (Soup): BeautifulSoup object

    Returns:
        A data dictionary::

            {
                'name': str,
                'name_english': str,
                'format': mal_scraper.Format,
                'episodes': int, or None when MAL does not know,
                'airing_status': mal_scraper.AiringStatus,
                'airing_started': date, or None when MAL does not know,
                'airing_finished': date, or None when MAL does not know,
                'airing_premiere': tuple(Year (int), Season (mal_scraper.Season))
                    or None (for films, OVAs, specials, ONAs, music, unknown, or
                    if MAL does not know),
                'mal_age_rating': mal_scraper.AgeRating,
                'mal_score': float, or None when not yet aired/MAL does not know,
                'mal_scored_by': int (number of people),
                'mal_rank': int, or None when not yet aired/some R rated anime,
                'mal_popularity': int,
                'mal_members': int,
                'mal_favourites': int,
            }

    Raises:
        ParseError: If any component of the page could not be processed
            or was unexpected.
    """
    # Order matters: each extractor receives the partially built dictionary,
    # and some of them read keys stored by earlier ones (the premiere
    # extractor reads 'format'; the rank extractor reads 'airing_status' and
    # 'mal_age_rating').
    extractors = (
        ('name', _get_name),
        ('name_english', _get_english_name),
        ('format', _get_format),
        ('episodes', _get_episodes),
        ('airing_status', _get_airing_status),
        ('airing_started', _get_start_date),
        ('airing_finished', _get_end_date),
        ('airing_premiere', _get_airing_premiere),
        ('mal_age_rating', _get_mal_age_rating),
        ('mal_score', _get_mal_score),
        ('mal_scored_by', _get_mal_scored_by),
        ('mal_rank', _get_mal_rank),
        ('mal_popularity', _get_mal_popularity),
        ('mal_members', _get_mal_members),
        ('mal_favourites', _get_mal_favourites),
    )

    anime = {}
    for key, extract in extractors:
        try:
            value = extract(soup, anime)
        except ParseError as err:
            # Attach the failing key to the error before letting it propagate.
            logger.debug('Failed to process tag %s', key)
            err.specify_tag(key)
            raise
        else:
            anime[key] = value

    return anime


def _get_name(soup, data=None):
    """Return the ``.string`` of the first ``<span itemprop="name">`` tag.

    Args:
        soup: Parsed page to search.
        data: Unused; present so all extractors can be called alike.

    Raises:
        MissingTagError: If the page has no such span.
    """
    name_tag = soup.find('span', itemprop='name')
    if name_tag:
        return name_tag.string

    raise MissingTagError('name')


def _get_english_name(soup, data=None):
    """Return the stripped text that follows the ``English:`` label span.

    Args:
        soup: Parsed page to search.
        data: Unused; present so all extractors can be called alike.

    Returns:
        str: The English title, or ``''`` when the page has no such label.
    """
    label = soup.find('span', string='English:')
    if label:
        return label.next_sibling.strip()

    # This is not always present (https://myanimelist.net/anime/15)
    return ''


def _get_format(soup, data=None):
    """Return the show's format, read from the text after the ``Type:`` label.

    Args:
        soup: Parsed page to search.
        data: Unused; present so all extractors can be called alike.

    Returns:
        The value produced by ``Format.mal_to_enum`` for the label's text.

    Raises:
        MissingTagError: If the page has no ``Type:`` label span.
        ParseError: If ``Format.mal_to_enum`` returns a falsy value.
    """
    label = soup.find('span', string='Type:')
    if not label:
        raise MissingTagError('type')

    # Look at no more than the first three nodes after the label and keep the
    # first one whose stripped text is non-empty. The generator is lazy, so
    # nodes after the first match are never touched.
    stripped_texts = (
        node.string.strip()
        for node in itertools.islice(label.next_siblings, 3)
    )
    text = next((candidate for candidate in stripped_texts if candidate), None)

    format_ = Format.mal_to_enum(text)
    if format_:
        return format_

    # pragma: no cover
    # Either we missed a format, or MAL changed the webpage
    raise ParseError('Unable to identify format from "{}"'.format(text))


def _get_episodes(soup, data=None):
    """Return the episode count read from the text after ``Episodes:``.

    Args:
        soup: Parsed page to search.
        data: Unused; present so all extractors can be called alike.

    Returns:
        int, or None when the text is ``unknown`` (compared case-insensitively).

    Raises:
        MissingTagError: If the page has no ``Episodes:`` label span.
        ParseError: If the text is neither ``unknown`` nor an integer.
    """
    label = soup.find('span', string='Episodes:')
    if not label:
        raise MissingTagError('episodes')

    raw_count = label.next_sibling.strip().lower()
    if raw_count == 'unknown':
        return None

    try:
        return int(raw_count)
    except (ValueError, TypeError):  # pragma: no cover
        # MAL probably changed the webpage
        raise ParseError('Unable to convert text "%s" to int' % raw_count)


def _get_airing_status(soup, data=None):
    """Return the airing status read from the node after ``Status:``.

    Args:
        soup: Parsed page to search.
        data: Unused; present so all extractors can be called alike.

    Returns:
        The value produced by ``AiringStatus.mal_to_enum``.

    Raises:
        MissingTagError: If the page has no ``Status:`` label span.
        ParseError: If ``AiringStatus.mal_to_enum`` returns a falsy value.
    """
    label = soup.find('span', string='Status:')
    if not label:
        raise MissingTagError('status')

    # The sibling is passed to the converter as-is (it is not stripped here).
    raw_status = label.next_sibling
    airing_status = AiringStatus.mal_to_enum(raw_status)
    if airing_status:
        return airing_status

    # pragma: no cover
    # MAL probably changed the website
    raise ParseError('Unable to identify airing status from "%s"' % raw_status)


def _get_start_date(soup, data=None):
    """Return the airing start date read from the text after ``Aired:``.

    Args:
        soup: Parsed page to search.
        data: Unused; present so all extractors can be called alike.

    Returns:
        The result of ``get_date`` for the first date in the text, or None
        when the text is ``not available`` (compared case-insensitively).

    Raises:
        MissingTagError: If the page has no ``Aired:`` label span.
        ParseError: If ``get_date`` raises ``ValueError`` for the text.
    """
    label = soup.find('span', string='Aired:')
    if not label:
        raise MissingTagError('aired')

    aired = label.next_sibling.strip().lower()
    if aired == 'not available':
        return None

    # The text is either a single date or "<start> to <end>"; everything
    # before the first ' to ' (or the whole text) is the start.
    first_part = aired.partition(' to ')[0]

    try:
        return get_date(first_part)
    except ValueError:  # pragma: no cover
        # MAL probably changed their website
        raise ParseError('Unable to identify date from "%s"' % first_part)


def _get_end_date(soup, data=None):
    """Return the airing end date read from the text after ``Aired:``.

    Args:
        soup: Parsed page to search.
        data: Unused; present so all extractors can be called alike.

    Returns:
        The result of ``get_date`` for the end of the range, or None when the
        text holds no ``' to '`` range or the end is ``?``.

    Raises:
        MissingTagError: If the page has no ``Aired:`` label span.
        ParseError: If ``get_date`` raises ``ValueError`` for the end text.
    """
    label = soup.find('span', string='Aired:')
    if not label:
        raise MissingTagError('aired')

    pieces = label.next_sibling.strip().split(' to ')

    # Not all Aired tags have a date range (https://myanimelist.net/anime/5)
    if len(pieces) < 2:
        return None

    finish_text = pieces[1]
    if finish_text == '?':
        return None

    try:
        return get_date(finish_text)
    except ValueError:  # pragma: no cover
        # MAL probably changed their website
        raise ParseError('Unable to identify date from "%s"' % finish_text)


def _get_airing_premiere(soup, data):
    """Return the premiere season read from the ``Premiered:`` section.

    Args:
        soup: Parsed page to search.
        data (dict): Fields extracted so far; ``data['format']`` is read when
            the page has no ``Premiered:`` label.

    Returns:
        tuple: ``(year (int), season)`` where season is the value produced by
        ``Season.mal_to_enum``; or None when the label is absent for a format
        that is allowed to lack one, or when the value is ``?``.

    Raises:
        MissingTagError: If the label is absent and the format is not one of
            the formats allowed to lack it.
        ParseError: If the linked text is not of the form "<season> <year>",
            or the season or the year cannot be identified.
    """
    pretag = soup.find('span', string='Premiered:')
    if not pretag:
        # Film: https://myanimelist.net/anime/5
        # OVA: https://myanimelist.net/anime/44
        # ONA: https://myanimelist.net/anime/574
        # TODO: Missing Special
        # Music: https://myanimelist.net/anime/3642
        # Unknown: https://myanimelist.net/anime/33352
        skip = (Format.film, Format.ova, Format.special, Format.ona, Format.music, Format.unknown)
        if data['format'] in skip:
            return None
        raise MissingTagError('premiered')

    # '?': https://myanimelist.net/anime/3624
    if pretag.next_sibling.string.strip() == '?':
        return None

    link = pretag.find_next('a')
    premiere_text = link.string if link is not None else None
    try:
        season_text, year_text = premiere_text.lower().split(' ')
    except (AttributeError, ValueError):
        # No link, no link text, or not exactly two space-separated words:
        # report it as a parse failure instead of leaking the raw error.
        raise ParseError('Unable to identify premiere from "%s"' % premiere_text)

    # Keep the original text in its own variable so that the error message
    # below shows what was on the page rather than the failed lookup result.
    season = Season.mal_to_enum(season_text)
    if season is None:
        # MAL probably changed their website
        raise ParseError('Unable to identify season from "%s"' % season_text)

    try:
        year = int(year_text)
    except (ValueError, TypeError):  # pragma: no cover
        # MAL probably changed their website
        raise ParseError('Unable to identify year from "%s"' % year_text)

    return (year, season)


def _get_mal_age_rating(soup, data=None):
    """Return the age rating read from the text after ``Rating:``.

    Args:
        soup: Parsed page to search.
        data: Unused; present so all extractors can be called alike.

    Returns:
        The value produced by ``AgeRating.mal_to_enum``.

    Raises:
        MissingTagError: If the page has no ``Rating:`` label span.
        ParseError: If ``AgeRating.mal_to_enum`` returns None.
    """
    label = soup.find('span', string='Rating:')
    if not label:
        raise MissingTagError('Rating')

    full_text = label.next_sibling.strip()

    # Drop everything from the first '(' onwards.
    short_text = full_text.partition('(')[0]
    if not short_text.startswith('R - 17+'):
        # A little hacky for PG-13: keep only what precedes the first ' - '.
        # 'R - 17+' is exempt because its ' - ' is part of the rating itself.
        short_text = short_text.partition(' - ')[0]

    age_rating = AgeRating.mal_to_enum(short_text)
    if age_rating is not None:
        return age_rating

    raise ParseError(
        'Unable to identify age rating from "%s" part of "%s"' % (short_text, full_text)
    )


def _get_mal_score(soup, data):
    """Return the score read from the first span after the ``Score:`` label.

    Args:
        soup: Parsed page to search.
        data: Unused; present so all extractors can be called alike.

    Returns:
        float, or None when the text is ``N/A``.

    Raises:
        MissingTagError: If the page has no ``Score:`` label span.
        ParseError: If the text is neither ``N/A`` nor a float.
    """
    label = soup.find('span', string='Score:')
    if not label:
        raise MissingTagError('Score')

    score_text = label.find_next_sibling('span').string.strip()
    if score_text != 'N/A':
        try:
            return float(score_text)
        except ValueError:
            raise ParseError('Unable to identify rating from "%s"' % score_text)

    # Not aired yet/MAL does not know anime are excluded
    return None


def _get_mal_scored_by(soup, data=None):
    """Return how many people scored the show.

    The count is read from the second ``<span>`` sibling that follows the
    ``Score:`` label; thousands separators (commas) are removed.

    Args:
        soup: Parsed page to search.
        data: Unused; present so all extractors can be called alike.

    Raises:
        MissingTagError: If the page has no ``Score:`` label span.
        ParseError: If the text is not an integer.
    """
    label = soup.find('span', string='Score:')
    if not label:
        raise MissingTagError('Score')

    following_spans = label.find_next_siblings('span')
    count_text = following_spans[1].string.strip().replace(',', '')

    try:
        voters = int(count_text)
    except ValueError:
        raise ParseError('Unable to identify #people scoring from "%s"' % count_text)

    return voters


def _get_mal_rank(soup, data):
    """Return the rank read from the text after the ``Ranked:`` label.

    Args:
        soup: Parsed page to search.
        data (dict): Fields extracted so far; ``data['airing_status']`` and
            ``data['mal_age_rating']`` are read when the rank text is ``N/A``.

    Returns:
        int, or None when the text is ``N/A`` and the show is either in the
        pre-air status or carries one of the excluded age ratings.

    Raises:
        MissingTagError: If the page has no ``Ranked:`` label span.
        ParseError: If the text cannot be read as an integer (this includes
            an ``N/A`` that matches neither exclusion above).
    """
    label = soup.find('span', string='Ranked:')
    if not label:
        raise MissingTagError('Ranked')

    rank_text = label.next_sibling.strip()

    # Not aired yet and some R+ anime are excluded
    unranked_ratings = (
        AgeRating.mal_none, AgeRating.mal_r1, AgeRating.mal_r2, AgeRating.mal_r3
    )
    if rank_text == 'N/A':
        not_yet_aired = data['airing_status'] == AiringStatus.pre_air
        if not_yet_aired or data['mal_age_rating'] in unranked_ratings:
            return None

    # Ranks are rendered like "#1,234": drop both decorations in one pass.
    digits = rank_text.translate(str.maketrans('', '', ',#'))
    try:
        return int(digits)
    except ValueError:
        raise ParseError('Unable to identify rank "%s"' % rank_text)


def _get_mal_popularity(soup, data=None):
    """Return the popularity position read from the text after ``Popularity:``.

    Commas and ``#`` characters are removed before the integer conversion.

    Args:
        soup: Parsed page to search.
        data: Unused; present so all extractors can be called alike.

    Raises:
        MissingTagError: If the page has no ``Popularity:`` label span.
        ParseError: If the remaining text is not an integer.
    """
    label = soup.find('span', string='Popularity:')
    if not label:
        raise MissingTagError('Popularity')

    raw_text = label.next_sibling.strip()
    digits = raw_text.translate(str.maketrans('', '', ',#'))

    try:
        position = int(digits)
    except ValueError:
        raise ParseError('Unable to identify popularity "%s"' % raw_text)

    return position


def _get_mal_members(soup, data=None):
    """Return the member count read from the text after ``Members:``.

    Thousands separators (commas) are removed before the integer conversion.

    Args:
        soup: Parsed page to search.
        data: Unused; present so all extractors can be called alike.

    Raises:
        MissingTagError: If the page has no ``Members:`` label span.
        ParseError: If the remaining text is not an integer.
    """
    label = soup.find('span', string='Members:')
    if not label:
        raise MissingTagError('Members')

    raw_text = label.next_sibling.strip()
    try:
        return int(raw_text.replace(',', ''))
    except ValueError:
        raise ParseError('Unable to identify #members "%s"' % raw_text)


def _get_mal_favourites(soup, data=None):
    """Return the favourites count read from the text after ``Favorites:``.

    The page label uses the American spelling even though the returned field
    is spelled ``favourites``. Thousands separators (commas) are removed
    before the integer conversion.

    Args:
        soup: Parsed page to search.
        data: Unused; present so all extractors can be called alike.

    Raises:
        MissingTagError: If the page has no ``Favorites:`` label span.
        ParseError: If the remaining text is not an integer.
    """
    label = soup.find('span', string='Favorites:')
    if not label:
        raise MissingTagError('Favorites')

    raw_text = label.next_sibling.strip()
    without_commas = ''.join(raw_text.split(','))

    try:
        favourites = int(without_commas)
    except ValueError:
        raise ParseError('Unable to identify #favourites "%s"' % raw_text)

    return favourites