# Source code for mal_scraper.anime

import itertools
import logging
from datetime import datetime

from bs4 import BeautifulSoup

from .consts import AgeRating, AiringStatus, Format, Retrieved, Season
from .exceptions import MissingTagError, ParseError
from .mal_utils import get_date
from .requester import request_passthrough
from .user_discovery import default_user_store

# Module-level logger, named after this module via ``__name__``.
logger = logging.getLogger(__name__)


def get_anime(id_ref=1, requester=request_passthrough):
    """Fetch a show's page and return its parsed information.

    Show identifiers can simply be enumerated. Any failure to fully
    retrieve and process the web-page surfaces as an exception rather
    than as partial data.

    TODO: Genres https://myanimelist.net/info.php?go=genre
    # Broadcast? Producers? Licensors? Studios? Source? Duration?

    Args:
        id_ref (int, optional): Internal show identifier.
        requester (requests-like, optional): HTTP request maker.
            This allows us to control/limit/mock requests.

    Returns:
        :class:`.Retrieved`: with the attributes `meta` and `data`.

        `meta` holds the keys ``'when'`` (``datetime.utcnow()`` taken
        after parsing), ``'id_ref'`` and ``'response'``.

        `data`::

            {
                'name': str,
                'name_english': str,
                'format': mal_scraper.Format,
                'episodes': int, or None when MAL does not know,
                'airing_status': mal_scraper.AiringStatus,
                'airing_started': date, or None when MAL does not know,
                'airing_finished': date, or None when MAL does not know,
                'airing_premiere': tuple(Year (int), Season (mal_scraper.Season))
                    or None (for films, OVAs, specials, ONAs, music, or
                    if MAL does not know),
                'mal_age_rating': mal_scraper.AgeRating,
                'mal_score': float, or None when not yet aired/MAL does not know,
                'mal_scored_by': int (number of people),
                'mal_rank': int, or None when not yet aired/some R rated anime,
                'mal_popularity': int,
                'mal_members': int,
                'mal_favourites': int,
            }

        See also :class:`.Format`, :class:`.AiringStatus`, :class:`.Season`.

    Raises:
        Network and Request Errors: See Requests library.
        .ParseError: Upon processing the web-page including anything that
            does not meet expectations.

    Examples:

        Retrieve the first anime and get the next anime to retrieve::

            next_anime = 1

            try:
                meta, data = mal_scraper.get_anime(next_anime)
            except mal_scraper.ParseError as err:
                logger.error('Investigate page %s with error %d', err.url, err.code)
            except NetworkandRequestErrors:  # Pseudo-code (TODO: These docs)
                pass  # Retry?
            else:
                mycode.save_data(data, when=meta['when'])

            next_anime = meta['id_ref'] + 1

    """
    page_url = get_url_from_id_ref(id_ref)
    logger.debug('Retrieving anime "%s" from "%s"', id_ref, page_url)

    # TODO: Raise RequestError if 404
    page = requester.get(page_url)
    page.raise_for_status()  # Propagates HTTP errors to the caller

    # Dynamic user discovery: harvest user names present on the page
    default_user_store.store_users_from_html(page.text)

    # Parsing may raise; the timestamp below is only taken on success
    parsed = get_anime_from_soup(BeautifulSoup(page.content, 'html.parser'))

    retrieval_info = {'when': datetime.utcnow(), 'id_ref': id_ref, 'response': page}
    return Retrieved(retrieval_info, parsed)
def get_url_from_id_ref(id_ref):
    """Return the MyAnimeList anime page URL for a show identifier.

    Args:
        id_ref (int): Internal show identifier (formatted with ``{:d}``).

    Returns:
        str: ``<protocol>://myanimelist.net/anime/<id_ref>`` where the
        protocol is ``http`` when ``_FORCE_HTTP`` is truthy, else ``https``.
    """
    # Use HTTPS to avoid auto-redirect from HTTP (except for tests)
    from .__init__ import _FORCE_HTTP
    protocol = 'http' if _FORCE_HTTP else 'https'
    return '{}://myanimelist.net/anime/{:d}'.format(protocol, id_ref)


def get_anime_from_soup(soup):
    """Return the anime information from a soup of HTML.

    Args:
        soup (Soup): BeautifulSoup object

    Returns:
        A data dictionary::

            {
                'name': str,
                'name_english': str,
                'format': mal_scraper.Format,
                'episodes': int, or None when MAL does not know,
                'airing_status': mal_scraper.AiringStatus,
                'airing_started': date, or None when MAL does not know,
                'airing_finished': date, or None when MAL does not know,
                'airing_premiere': tuple(Year (int), Season (mal_scraper.Season))
                    or None (for films, OVAs, specials, ONAs, music, unknown,
                    or if MAL does not know),
                'mal_age_rating': mal_scraper.AgeRating,
                'mal_score': float, or None when not yet aired/MAL does not know,
                'mal_scored_by': int (number of people),
                'mal_rank': int, or None when not yet aired/some R rated anime,
                'mal_popularity': int,
                'mal_members': int,
                'mal_favourites': int,
            }

    Raises:
        ParseError: If any component of the page could not be processed or
            was unexpected.

    """
    # Order matters: later extractors read earlier results out of `data`
    # (`_get_airing_premiere` needs 'format'; `_get_mal_rank` needs
    # 'airing_status' and 'mal_age_rating').
    process = [
        ('name', _get_name),
        ('name_english', _get_english_name),
        ('format', _get_format),
        ('episodes', _get_episodes),
        ('airing_status', _get_airing_status),
        ('airing_started', _get_start_date),
        ('airing_finished', _get_end_date),
        ('airing_premiere', _get_airing_premiere),
        ('mal_age_rating', _get_mal_age_rating),
        ('mal_score', _get_mal_score),
        ('mal_scored_by', _get_mal_scored_by),
        ('mal_rank', _get_mal_rank),
        ('mal_popularity', _get_mal_popularity),
        ('mal_members', _get_mal_members),
        ('mal_favourites', _get_mal_favourites),
    ]

    data = {}
    for tag, func in process:
        try:
            result = func(soup, data)
        except ParseError as err:
            logger.debug('Failed to process tag %s', tag)
            # Record which field failed before propagating the error
            err.specify_tag(tag)
            raise

        data[tag] = result

    return data


def _get_name(soup, data=None):
    """Return the string of the ``<span itemprop="name">`` tag.

    Raises:
        MissingTagError: If the tag is not found.
    """
    tag = soup.find('span', itemprop='name')
    if not tag:
        raise MissingTagError('name')

    text = tag.string
    return text


def _get_english_name(soup, data=None):
    """Return the text following the 'English:' label, or '' if absent."""
    pretag = soup.find('span', string='English:')

    # This is not always present (https://myanimelist.net/anime/15)
    if not pretag:
        return ''

    text = pretag.next_sibling.strip()
    return text


def _get_format(soup, data=None):
    """Return the Format enum for the text following the 'Type:' label.

    Raises:
        MissingTagError: If the 'Type:' label is not found.
        ParseError: If the text cannot be mapped to a Format.
    """
    pretag = soup.find('span', string='Type:')
    if not pretag:
        raise MissingTagError('type')

    # Take the first non-blank text among the next three siblings
    for text in itertools.islice(pretag.next_siblings, 3):
        text = text.string.strip()
        if text:
            break
    else:
        text = None

    format_ = Format.mal_to_enum(text)
    if not format_:  # pragma: no cover
        # Either we missed a format, or MAL changed the webpage
        raise ParseError('Unable to identify format from "{}"'.format(text))

    return format_


def _get_episodes(soup, data=None):
    """Return the episode count as int, or None when listed as 'unknown'.

    Raises:
        MissingTagError: If the 'Episodes:' label is not found.
        ParseError: If the text is not an integer.
    """
    pretag = soup.find('span', string='Episodes:')
    if not pretag:
        raise MissingTagError('episodes')

    episodes_text = pretag.next_sibling.strip().lower()
    if episodes_text == 'unknown':
        return None

    try:
        episodes_number = int(episodes_text)
    except (ValueError, TypeError):  # pragma: no cover
        # MAL probably changed the webpage
        raise ParseError('Unable to convert text "%s" to int' % episodes_text)

    return episodes_number


def _get_airing_status(soup, data=None):
    """Return the AiringStatus enum for the text following 'Status:'.

    Raises:
        MissingTagError: If the 'Status:' label is not found.
        ParseError: If the text cannot be mapped to an AiringStatus.
    """
    pretag = soup.find('span', string='Status:')
    if not pretag:
        raise MissingTagError('status')

    status_text = pretag.next_sibling
    status = AiringStatus.mal_to_enum(status_text)

    if not status:  # pragma: no cover
        # MAL probably changed the website
        raise ParseError('Unable to identify airing status from "%s"' % status_text)

    return status


def _get_start_date(soup, data=None):
    """Return the start date from the 'Aired:' text, or None if 'not available'.

    Raises:
        MissingTagError: If the 'Aired:' label is not found.
        ParseError: If the start text cannot be converted by ``get_date``.
    """
    pretag = soup.find('span', string='Aired:')
    if not pretag:
        raise MissingTagError('aired')

    aired_text = pretag.next_sibling.strip().lower()
    if aired_text == 'not available':
        return None

    # The text is either a single date or '<start> to <end>'
    start_text = aired_text.split(' to ')[0]

    try:
        start_date = get_date(start_text)
    except ValueError:  # pragma: no cover
        # MAL probably changed their website
        raise ParseError('Unable to identify date from "%s"' % start_text)

    return start_date


def _get_end_date(soup, data=None):
    """Return the end date from the 'Aired:' text, or None if absent or '?'.

    Raises:
        MissingTagError: If the 'Aired:' label is not found.
        ParseError: If the end text cannot be converted by ``get_date``.
    """
    pretag = soup.find('span', string='Aired:')
    if not pretag:
        raise MissingTagError('aired')

    aired_text = pretag.next_sibling.strip()
    date_range_text = aired_text.split(' to ')

    # Not all Aired tags have a date range (https://myanimelist.net/anime/5)
    try:
        end_text = date_range_text[1]
    except IndexError:
        return None

    if end_text == '?':
        return None

    try:
        end_date = get_date(end_text)
    except ValueError:  # pragma: no cover
        # MAL probably changed their website
        raise ParseError('Unable to identify date from "%s"' % end_text)

    return end_date


def _get_airing_premiere(soup, data):
    """Return ``(year, Season)`` from the 'Premiered:' link, or None.

    None is returned when the label is missing for a format that is
    expected to lack it, or when the value is '?'.

    Args:
        soup: BeautifulSoup object.
        data (dict): Results gathered so far; ``data['format']`` is read
            when the 'Premiered:' label is missing.

    Raises:
        MissingTagError: If the label is missing for any other format.
        ParseError: If the season or year cannot be identified.
    """
    pretag = soup.find('span', string='Premiered:')
    if not pretag:
        # Film: https://myanimelist.net/anime/5
        # OVA: https://myanimelist.net/anime/44
        # ONA: https://myanimelist.net/anime/574
        # TODO: Missing Special
        # Music: https://myanimelist.net/anime/3642
        # Unknown: https://myanimelist.net/anime/33352
        skip = (Format.film, Format.ova, Format.special, Format.ona, Format.music,
                Format.unknown)
        if data['format'] in skip:
            return None
        else:
            raise MissingTagError('premiered')

    # '?': https://myanimelist.net/anime/3624
    if pretag.next_sibling.string.strip() == '?':
        return None

    season_text, year_text = pretag.find_next('a').string.lower().split(' ')

    # Keep the raw text in its own variable so the error message reports
    # the offending input rather than the failed lookup result (None).
    season = Season.mal_to_enum(season_text)
    if season is None:
        # MAL probably changed their website
        raise ParseError('Unable to identify season from "%s"' % season_text)

    try:
        year = int(year_text)
    except (ValueError, TypeError):  # pragma: no cover
        # MAL probably changed their website
        raise ParseError('Unable to identify year from "%s"' % year_text)

    return (year, season)


def _get_mal_age_rating(soup, data=None):
    """Return the AgeRating enum for the text following 'Rating:'.

    Raises:
        MissingTagError: If the 'Rating:' label is not found.
        ParseError: If the text cannot be mapped to an AgeRating.
    """
    pretag = soup.find('span', string='Rating:')
    if not pretag:
        raise MissingTagError('Rating')

    full_text = pretag.next_sibling.strip()
    # Drop anything from the first '(' onwards
    rating_text = full_text.split('(')[0]
    if not rating_text.startswith('R - 17+'):
        rating_text = rating_text.split(' - ')[0]  # A little hacky for PG-13

    rating = AgeRating.mal_to_enum(rating_text)

    if rating is None:
        raise ParseError(
            'Unable to identify age rating from "%s" part of "%s"' % (rating_text, full_text)
        )

    return rating


def _get_mal_score(soup, data):
    """Return the score as float, or None when shown as 'N/A'.

    Raises:
        MissingTagError: If the 'Score:' label is not found.
        ParseError: If the text is not a float.
    """
    pretag = soup.find('span', string='Score:')
    if not pretag:
        raise MissingTagError('Score')

    rating_text = pretag.find_next_sibling('span').string.strip()

    # Not aired yet/MAL does not know anime are excluded
    if rating_text == 'N/A':
        return None

    try:
        return float(rating_text)
    except ValueError:
        raise ParseError('Unable to identify rating from "%s"' % rating_text)


def _get_mal_scored_by(soup, data=None):
    """Return the number of people who scored the show as int.

    The count is read from the second ``<span>`` sibling after 'Score:'.

    Raises:
        MissingTagError: If the 'Score:' label is not found.
        ParseError: If the text is not an integer.
    """
    pretag = soup.find('span', string='Score:')
    if not pretag:
        raise MissingTagError('Score')

    count_text = pretag.find_next_siblings('span')[1].string.strip().replace(',', '')

    try:
        return int(count_text)
    except ValueError:
        raise ParseError('Unable to identify #people scoring from "%s"' % count_text)


def _get_mal_rank(soup, data):
    """Return the rank as int, or None for an expected 'N/A'.

    Args:
        soup: BeautifulSoup object.
        data (dict): Results gathered so far; ``data['airing_status']`` and
            ``data['mal_age_rating']`` are read to decide whether 'N/A'
            is acceptable.

    Raises:
        MissingTagError: If the 'Ranked:' label is not found.
        ParseError: If the text is not a rank number (including an
            unexpected 'N/A').
    """
    pretag = soup.find('span', string='Ranked:')
    if not pretag:
        raise MissingTagError('Ranked')

    full_text = pretag.next_sibling.strip()

    # Not aired yet and some R+ anime are excluded
    excluded_age_ratings = (
        AgeRating.mal_none,
        AgeRating.mal_r1,
        AgeRating.mal_r2,
        AgeRating.mal_r3,
    )
    if (full_text == 'N/A' and
            (data['airing_status'] == AiringStatus.pre_air or
             data['mal_age_rating'] in excluded_age_ratings)):
        return None

    number_value = full_text.replace(',', '').replace('#', '')

    try:
        return int(number_value)
    except ValueError:
        raise ParseError('Unable to identify rank "%s"' % full_text)


def _get_mal_popularity(soup, data=None):
    """Return the popularity position as int.

    Raises:
        MissingTagError: If the 'Popularity:' label is not found.
        ParseError: If the text is not an integer after removing ',' and '#'.
    """
    pretag = soup.find('span', string='Popularity:')
    if not pretag:
        raise MissingTagError('Popularity')

    full_text = pretag.next_sibling.strip()
    number_value = full_text.replace(',', '').replace('#', '')

    try:
        return int(number_value)
    except ValueError:
        raise ParseError('Unable to identify popularity "%s"' % full_text)


def _get_mal_members(soup, data=None):
    """Return the member count as int.

    Raises:
        MissingTagError: If the 'Members:' label is not found.
        ParseError: If the text is not an integer after removing ','.
    """
    pretag = soup.find('span', string='Members:')
    if not pretag:
        raise MissingTagError('Members')

    full_text = pretag.next_sibling.strip()
    number_value = full_text.replace(',', '')

    try:
        return int(number_value)
    except ValueError:
        raise ParseError('Unable to identify #members "%s"' % full_text)


def _get_mal_favourites(soup, data=None):
    """Return the favourites count as int.

    Raises:
        MissingTagError: If the 'Favorites:' label is not found.
        ParseError: If the text is not an integer after removing ','.
    """
    pretag = soup.find('span', string='Favorites:')
    if not pretag:
        raise MissingTagError('Favorites')

    full_text = pretag.next_sibling.strip()
    number_value = full_text.replace(',', '')

    try:
        return int(number_value)
    except ValueError:
        raise ParseError('Unable to identify #favourites "%s"' % full_text)