Source code for mal_scraper.user_discovery

"""Discover user_ids (automatically)."""

import logging
import re

from .requester import request_passthrough

logger = logging.getLogger(__name__)


def discover_users(requester=request_passthrough, use_cache=True, use_web=None):
    """Return a set of user_ids usable by other user-related library calls.

    By default we will attempt to return any user_ids in our cache, clearing
    the cache in the process. If there are none in the cache, we will attempt
    to find some on MAL, but these will be biased towards recently active
    users.

    The cache is built up by discovering users on the web-pages retrieved by
    other API calls as you make those calls.

    Args:
        requester (requests-like, optional): HTTP request maker.
            This allows us to control/limit/mock requests.
        use_cache (bool, optional): Use the cache that we have built up over time?
            True (default): Get and clear the cache.
            False: Pretend the cache is empty (and do not clear it).
        use_web (bool, optional): Control whether to fall back to scraping.
            None (default) to make a network call only if the cache is empty.
            False to never make a network call.
            True to always make a network call.

    Returns:
        A set of user_ids which are strings.

    Raises:
        Network and Request Errors: See Requests library.

    Examples:
        Get user_ids discovered from earlier uses of the library::

            animes = mal_scraper.get_anime()
            users_probably_from_cache = mal_scraper.discover_users()

        Get user_ids if there are any in the cache, but don't make a network
        call just to find some::

            users_from_cache = mal_scraper.discover_users(use_web=False)

        Discover some users from the web, ignoring the cache::

            users_from_web = mal_scraper.discover_users(use_cache=False)
    """
    # TODO: Dependency injection for user store
    # TODO: Test this method
    discovered_users = set()

    if use_cache:
        discovered_users |= default_user_store.get_and_clear_cache()

    # Force use of the web, or fall back to the web if the cache is empty
    if use_web or (use_web is None and not discovered_users):
        response = requester.get(get_url_for_user_discovery())
        response.raise_for_status()  # May raise

        discovered_users |= set(discover_users_from_html(response.text))

    return discovered_users
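# A minimal sketch (illustrative, not part of the module) of exercising the
# ``requester`` injection above: any object with a requests-like ``get``
# returning something with ``.text`` and ``.raise_for_status()`` will do.
# ``FakeResponse`` and ``FakeRequester`` are hypothetical names used only
# for this example.
#
#     class FakeResponse:
#         text = '<a href="/profile/TheLlama">'
#
#         def raise_for_status(self):
#             pass  # Pretend the HTTP request succeeded
#
#     class FakeRequester:
#         def get(self, url):
#             return FakeResponse()
#
#     # Skip the cache and "scrape" the canned HTML above
#     users = discover_users(requester=FakeRequester(), use_cache=False, use_web=True)
#     assert users == {'TheLlama'}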
def get_url_for_user_discovery():
    """Return the URL to the profile discovery page."""
    # Use HTTPS to avoid auto-redirect from HTTP (except for tests)
    from .__init__ import _FORCE_HTTP  # noqa
    protocol = 'http' if _FORCE_HTTP else 'https'
    return '{}://myanimelist.net/users.php'.format(protocol)


_username_regex = re.compile(
    r"href=[\"'](https?\://myanimelist\.net)?/profile/(?P<username>\w+)[\w/]*[\"']",
    re.ASCII | re.DOTALL | re.IGNORECASE,
)


def discover_users_from_html(html):
    """Generate usernames from the given HTML (usernames may be duplicated).

    Args:
        html (str): HTML to hunt through.

    Yields:
        user_id (string)

    Test strings::

        <a href="/profile/TheLlama">
        <a href="https://myanimelist.net/profile/TheLlama">
        <a href="/profile/TheLlama/reviews">All reviews</a>
    """
    return (m.group('username') for m in _username_regex.finditer(html))


class UserStore:
    """Cache the dynamic discovery of users."""

    def __init__(self):
        self.cache = set()

    def store_users_from_html(self, html):
        """Store the users discovered in the given HTML text into the cache."""
        self.cache |= set(discover_users_from_html(html))

    def get_and_clear_cache(self):
        """Return all cached user_ids, emptying the cache."""
        cache, self.cache = self.cache, set()
        return cache


default_user_store = UserStore()
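# A minimal sketch (illustrative, not part of the module) of how the
# regex-based discovery and the UserStore cache fit together, using one of
# the docstring's own test strings as input:
#
#     html = '<a href="/profile/TheLlama/reviews">All reviews</a>'
#     assert set(discover_users_from_html(html)) == {'TheLlama'}
#
#     # Other API calls feed their response HTML into the default store;
#     # discover_users(use_web=False) then drains it.
#     default_user_store.store_users_from_html(html)
#     assert default_user_store.get_and_clear_cache() == {'TheLlama'}
#     assert default_user_store.get_and_clear_cache() == set()  # Now empty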