Source code for getsitemap.retrieve_sitemap

import concurrent.futures
import urllib.robotparser as rp
from typing import Dict, Union

import requests
from bs4 import BeautifulSoup


def _concurrent_thread_starter(urls: list, thread_max: int, allow_xml_inference: bool, dedupe_results: bool):
    """
    Create a pool of threads to retrieve sitemap files.

    :param urls: A list of URLs to retrieve.
    :type urls: list
    :param thread_max: The maximum number of threads to use in sitemap retrieval requests.
    :type thread_max: int
    :param allow_xml_inference: Whether or not to infer that a URL ending in .xml is a sitemap.
    :type allow_xml_inference: bool
    :param dedupe_results: Whether or not to deduplicate the results.
    :type dedupe_results: bool
    :return: A dictionary of URLs found in each discovered sitemap.
    :rtype: dict
    """
    results = {}

    with concurrent.futures.ThreadPoolExecutor(max_workers=thread_max) as executor:
        # pass the options by keyword so they line up with get_individual_sitemap's signature
        processes = [
            executor.submit(
                get_individual_sitemap,
                url,
                dedupe_results=dedupe_results,
                allow_xml_inference=allow_xml_inference,
            )
            for url in urls
        ]

        for process in concurrent.futures.as_completed(processes):
            results.update(process.result())

    return results
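
# Illustrative sketch (not part of the library): assuming two hypothetical child
# sitemaps on example.com, the thread starter fans out one request per sitemap URL
# and merges each per-sitemap dictionary into a single result, e.g.
#
#   _concurrent_thread_starter(
#       ["https://example.com/sitemap-posts.xml", "https://example.com/sitemap-pages.xml"],
#       thread_max=20,
#       allow_xml_inference=True,
#       dedupe_results=True,
#   )
#   # -> {"https://example.com/sitemap-posts.xml": [...],
#   #     "https://example.com/sitemap-pages.xml": [...]}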


def _parse_list_of_urls(
    parsed_file: BeautifulSoup,
    root_url: str,
    allow_xml_inference: bool = True,
    recurse: bool = True,
    dedupe_results: bool = True,
) -> dict:
    """
    Get all the URLs in a non-sitemapindex sitemap.

    :param parsed_file: The parsed sitemap file.
    :type parsed_file: BeautifulSoup
    :param root_url: The URL of the sitemap.
    :type root_url: str
    :param allow_xml_inference: Whether or not to infer that a URL ending in .xml is a sitemap.
    :type allow_xml_inference: bool
    :param recurse: Whether or not to recurse into nested sitemaps.
    :type recurse: bool
    :param dedupe_results: Whether or not to deduplicate the results.
    :type dedupe_results: bool
    :return: A dictionary of URLs found in each discovered sitemap.
    :rtype: dict
    """
    all_urls = {}

    for url in parsed_file.find_all("url"):
        loc = url.find("loc")

        if not loc or not loc.text:
            continue

        if loc.text.endswith(".xml") and allow_xml_inference:
            # treat a .xml URL inside a urlset as a nested sitemap and, when
            # recursion is enabled, fetch its URLs too
            if recurse:
                all_urls.update(
                    get_individual_sitemap(
                        loc.text.strip(),
                        dedupe_results=dedupe_results,
                        allow_xml_inference=allow_xml_inference,
                    )
                )
        else:
            if all_urls.get(root_url):
                all_urls[root_url].append(loc.text.strip())
            else:
                all_urls[root_url] = [loc.text.strip()]

    return all_urls
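
# Illustrative sketch (not part of the library): for a hypothetical urlset sitemap at
# https://example.com/sitemap.xml containing <url><loc> entries for /about and
# /contact, the parser groups the page URLs under the sitemap URL:
#
#   soup = BeautifulSoup(
#       "<urlset>"
#       "<url><loc>https://example.com/about</loc></url>"
#       "<url><loc>https://example.com/contact</loc></url>"
#       "</urlset>",
#       "xml",
#   )
#   _parse_list_of_urls(soup, "https://example.com/sitemap.xml")
#   # -> {"https://example.com/sitemap.xml":
#   #     ["https://example.com/about", "https://example.com/contact"]}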


def get_individual_sitemap(
    root_url: str,
    thread_max: int = 20,
    dedupe_results: bool = True,
    allow_xml_inference: bool = True,
    recurse: bool = False,
) -> dict:
    """
    Get all of the URLs associated with a single sitemap.

    :param root_url: The URL of the sitemap.
    :type root_url: str
    :param thread_max: The maximum number of threads to use in sitemap retrieval requests.
    :type thread_max: int
    :param dedupe_results: Whether or not to deduplicate the results.
    :type dedupe_results: bool
    :param allow_xml_inference: Whether or not to infer that a URL ending in .xml is a sitemap.
    :type allow_xml_inference: bool
    :param recurse: Whether or not to recurse into other sitemaps.
    :type recurse: bool
    :return: A dictionary of URLs found in each discovered sitemap.
    :rtype: dict

    Example:

    .. code-block:: python

        import getsitemap

        urls = getsitemap.get_individual_sitemap("https://jamesg.blog/sitemap.xml")

        print(urls) # {"https://jamesg.blog/sitemap.xml": ["https://jamesg.blog/2020/09/01/my-experience-with-jekyll/", ...]}
    """
    all_urls = {}

    try:
        sitemap_file = requests.get(root_url, timeout=10)
    except requests.exceptions.RequestException:
        return {}

    if sitemap_file.status_code != 200:
        return {}

    parsed_file = BeautifulSoup(sitemap_file.text, "xml")

    if parsed_file.find("sitemapindex"):
        # find all the urls in the sitemap index
        all_sitemaps = parsed_file.find_all("sitemap")

        sitemap_urls = list(set([sitemap.find("loc").text for sitemap in all_sitemaps if sitemap.find("loc")]))

        if recurse:
            all_urls.update(
                _concurrent_thread_starter(sitemap_urls, thread_max, allow_xml_inference, dedupe_results)
            )
        else:
            return {root_url: sitemap_urls}
    else:
        all_urls.update(_parse_list_of_urls(parsed_file, root_url, allow_xml_inference))

    if dedupe_results:
        for key, value in all_urls.items():
            # remove duplicates
            all_urls[key] = list(set(value))

    return all_urls
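
# Illustrative sketch (not part of the library): when the fetched file is a
# <sitemapindex> and recurse is left at its default of False, the function does not
# fetch the child sitemaps; it returns the index URL mapped to them, e.g.
#
#   get_individual_sitemap("https://example.com/sitemap_index.xml")
#   # -> {"https://example.com/sitemap_index.xml":
#   #     ["https://example.com/sitemap-posts.xml", "https://example.com/sitemap-pages.xml"]}
#
# Passing recurse=True fetches each child sitemap and returns its page URLs,
# keyed by that child sitemap's URL.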


def _flatten_sitemap_dictionaries(all_discovered_sitemaps: dict) -> dict:
    """
    Flatten nested sitemap dictionaries into a single dictionary keyed by sitemap URL.

    :param all_discovered_sitemaps: A dictionary of sitemaps.
    :type all_discovered_sitemaps: dict
    :return: A dictionary mapping each sitemap URL to its list of URLs.
    :rtype: dict
    """
    flat_sitemaps: Dict[str, list] = {}

    for key, url in all_discovered_sitemaps.items():
        if isinstance(url, dict):
            # merge nested per-sitemap dictionaries into the top-level mapping
            for nested_key, value in url.items():
                if flat_sitemaps.get(nested_key):
                    flat_sitemaps[nested_key].extend(value)
                else:
                    flat_sitemaps[nested_key] = value
        else:
            flat_sitemaps[key] = url

    return flat_sitemaps
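
# Illustrative sketch (not part of the library): any nested per-sitemap dictionaries
# are merged up into a single mapping keyed by sitemap URL, e.g.
#
#   _flatten_sitemap_dictionaries({
#       "https://example.com/sitemap-posts.xml": ["https://example.com/post-1"],
#       "https://example.com/sitemap_index.xml": {
#           "https://example.com/sitemap-pages.xml": ["https://example.com/about"],
#       },
#   })
#   # -> {"https://example.com/sitemap-posts.xml": ["https://example.com/post-1"],
#   #     "https://example.com/sitemap-pages.xml": ["https://example.com/about"]}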


def retrieve_sitemap_urls(
    root_page: str,
    as_flat_list: bool = True,
    allow_xml_inference: bool = True,
    thread_max: int = 20,
    dedupe_results: bool = True,
) -> Union[list, dict]:
    """
    Find all of the URLs in every sitemap associated with a provided domain.

    This function will take a bit of time to run, depending on how many URLs are discovered.

    :param root_page: The root page of the domain to search for sitemaps.
    :type root_page: str
    :param as_flat_list: Whether or not to return the URLs as a flat list.
    :type as_flat_list: bool
    :param allow_xml_inference: Whether or not to infer that a URL ending in .xml is a sitemap.
    :type allow_xml_inference: bool
    :param thread_max: The maximum number of threads to use in sitemap retrieval requests.
    :type thread_max: int
    :param dedupe_results: Whether or not to remove duplicate URLs.
    :type dedupe_results: bool
    :return: A list of URLs, or a dictionary of URLs keyed by sitemap if as_flat_list is False.
    :rtype: Union[list, dict]

    Example:

    .. code-block:: python

        import getsitemap

        all_urls = getsitemap.retrieve_sitemap_urls("https://www.example.com")

        print(all_urls) # ["https://www.example.com", "https://www.example.com/about", ...]
    """
    all_discovered_urls = {}

    parser = rp.RobotFileParser()

    parser.set_url(root_page.rstrip("/") + "/robots.txt")
    parser.read()

    sitemap_urls = parser.site_maps()

    if sitemap_urls:
        if root_page + "/sitemap.xml" not in sitemap_urls:
            sitemap_urls.append(root_page + "/sitemap.xml")

        unique_sitemaps = list(set(sitemap_urls))

        new_urls = _concurrent_thread_starter(unique_sitemaps, thread_max, allow_xml_inference, dedupe_results)

        all_discovered_urls.update(new_urls)

    if as_flat_list:
        return [url for url_list in all_discovered_urls.values() for url in url_list]

    return _flatten_sitemap_dictionaries(all_discovered_urls)
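
# Illustrative sketch (not part of the library): with as_flat_list=False the URLs
# stay grouped by the sitemap that listed them rather than being flattened:
#
#   getsitemap.retrieve_sitemap_urls("https://www.example.com", as_flat_list=False)
#   # -> {"https://www.example.com/sitemap.xml":
#   #     ["https://www.example.com", "https://www.example.com/about", ...]}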