Source code for getsitemap.retrieve_sitemap
import concurrent.futures
import urllib.robotparser as rp
from typing import Dict, Union
import requests
from bs4 import BeautifulSoup
def _concurrent_thread_starter(urls: list, thread_max: int, allow_xml_inference: bool, dedupe_results: bool):
"""
Create a pool of threads to retrieve sitemap files.
:param urls: A list of URLs to retrieve.
:type urls: list
:param thread_max: The maximum number of threads to use in sitemap retrieval requests.
:type thread_max: int
:param allow_xml_inference: Whether or not to infer that a URL ending in .xml is a sitemap.
:type allow_xml_inference: bool
:param dedupe_results: Whether or not to deduplicate the results.
:type dedupe_results: bool
:return: A dictionary of URLs found in each discovered sitemap.
:rtype: dict
"""
results = {}
with concurrent.futures.ThreadPoolExecutor(max_workers=thread_max) as executor:
        # Use keyword arguments so each value maps to the right parameter of
        # get_individual_sitemap (its second positional parameter is thread_max,
        # so passing dedupe_results positionally would land in the wrong slot).
        futures = [
            executor.submit(
                get_individual_sitemap,
                url,
                dedupe_results=dedupe_results,
                allow_xml_inference=allow_xml_inference,
            )
            for url in urls
        ]
        for future in concurrent.futures.as_completed(futures):
            results.update(future.result())
return results
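A minimal usage sketch for this helper, assuming the two sitemap URLs below (both hypothetical) resolve; each URL is fetched on its own worker thread and the per-sitemap results are merged into one dictionary:

.. code-block:: python

    urls = [
        "https://www.example.com/sitemap-posts.xml",  # hypothetical sitemap URL
        "https://www.example.com/sitemap-pages.xml",  # hypothetical sitemap URL
    ]
    results = _concurrent_thread_starter(
        urls, thread_max=20, allow_xml_inference=True, dedupe_results=True
    )
    # results maps each sitemap URL to the page URLs it lists, e.g.
    # {"https://www.example.com/sitemap-posts.xml": ["https://www.example.com/post-1", ...], ...}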
def _parse_list_of_urls(
parsed_file: BeautifulSoup,
root_url: str,
allow_xml_inference: bool = True,
recurse: bool = True,
dedupe_results: bool = True,
) -> dict:
"""
Get all the URLs in a non-sitemapindex sitemap.
:param parsed_file: The parsed sitemap file.
:type parsed_file: BeautifulSoup
:param root_url: The URL of the sitemap.
:type root_url: str
    :param allow_xml_inference: Whether or not to infer that a URL ending in .xml is a sitemap.
    :type allow_xml_inference: bool
    :param recurse: Whether or not to retrieve sitemaps inferred from URLs ending in .xml.
    :type recurse: bool
    :param dedupe_results: Whether or not to deduplicate the results.
    :type dedupe_results: bool
:return: A dictionary of URLs found in each discovered sitemap.
:rtype: dict
"""
all_urls = {}
for url in parsed_file.find_all("url"):
if not url.find("loc") or not url.find("loc").text:
continue
if url.find("loc") and url.find("loc").text.endswith(".xml") and allow_xml_inference:
if recurse is False:
all_urls.update(
get_individual_sitemap(url.find("loc").text.strip(), allow_xml_inference, dedupe_results)
)
elif url.find("loc"):
if all_urls.get(root_url):
all_urls[root_url].append(url.find("loc").text.strip())
else:
all_urls[root_url] = [url.find("loc").text.strip()]
return all_urls
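For reference, a sketch of the kind of <urlset> document this helper consumes (hypothetical XML), parsed the same way the module parses live responses:

.. code-block:: python

    from bs4 import BeautifulSoup

    xml = """
    <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
      <url><loc>https://www.example.com/</loc></url>
      <url><loc>https://www.example.com/about</loc></url>
    </urlset>
    """
    parsed = BeautifulSoup(xml, "xml")
    urls = _parse_list_of_urls(parsed, "https://www.example.com/sitemap.xml")
    # {"https://www.example.com/sitemap.xml": ["https://www.example.com/", "https://www.example.com/about"]}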
def get_individual_sitemap(
root_url: str,
thread_max: int = 20,
dedupe_results: bool = True,
allow_xml_inference: bool = True,
recurse: bool = False,
) -> dict:
"""
Get all of the URLs associated with a single sitemap.
:param root_url: The URL of the sitemap.
:type root_url: str
    :param thread_max: The maximum number of threads to use in sitemap retrieval requests.
    :type thread_max: int
    :param dedupe_results: Whether or not to deduplicate the results.
    :type dedupe_results: bool
:param allow_xml_inference: Whether or not to infer that a URL ending in .xml is a sitemap.
:type allow_xml_inference: bool
:param recurse: Whether or not to recurse into other sitemaps.
:type recurse: bool
:return: A dictionary of URLs found in each discovered sitemap.
:rtype: dict
Example:
.. code-block:: python
import getsitemap
urls = getsitemap.get_individual_sitemap("https://jamesg.blog/sitemap.xml")
print(urls) # ["https://jamesg.blog/2020/09/01/my-experience-with-jekyll/", ...]
"""
all_urls = {}
try:
sitemap_file = requests.get(root_url, timeout=10)
except requests.exceptions.RequestException:
return {}
if sitemap_file.status_code != 200:
return {}
parsed_file = BeautifulSoup(sitemap_file.text, "xml")
if parsed_file.find("sitemapindex"):
        # Find all of the child sitemap URLs listed in the sitemap index.
        all_sitemaps = parsed_file.find_all("sitemap")
        sitemap_urls = list({sitemap.find("loc").text.strip() for sitemap in all_sitemaps if sitemap.find("loc")})
if recurse:
all_urls.update(_concurrent_thread_starter(sitemap_urls, thread_max, allow_xml_inference, dedupe_results))
else:
return {root_url: sitemap_urls}
else:
        # Pass recurse and dedupe_results through so the caller's settings apply.
        all_urls.update(
            _parse_list_of_urls(
                parsed_file,
                root_url,
                allow_xml_inference=allow_xml_inference,
                recurse=recurse,
                dedupe_results=dedupe_results,
            )
        )
if dedupe_results:
for key, value in all_urls.items():
# remove duplicates
all_urls[key] = list(set(value))
return all_urls
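A sketch of the sitemap-index path, using a hypothetical index URL: with the default recurse=False the child sitemap URLs are returned under the index URL, while recurse=True fetches each child sitemap concurrently and returns its page URLs instead:

.. code-block:: python

    import getsitemap

    # Default (recurse=False): return the child sitemap URLs listed in the index.
    children = getsitemap.get_individual_sitemap("https://www.example.com/sitemap_index.xml")
    # {"https://www.example.com/sitemap_index.xml": ["https://www.example.com/sitemap-posts.xml", ...]}

    # recurse=True: fetch each child sitemap and return the page URLs it contains.
    pages = getsitemap.get_individual_sitemap(
        "https://www.example.com/sitemap_index.xml", recurse=True
    )
    # {"https://www.example.com/sitemap-posts.xml": ["https://www.example.com/post-1", ...], ...}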
def _flatten_sitemap_dictionaries(all_discovered_sitemaps: dict) -> dict:
"""
    Merge a dictionary of sitemap results, whose values may themselves be
    dictionaries of per-sitemap results, into a single flat dictionary.
    :param all_discovered_sitemaps: A dictionary of sitemap results.
    :type all_discovered_sitemaps: dict
    :return: A dictionary mapping each sitemap URL to its list of URLs.
    :rtype: dict
"""
flat_sitemaps: Dict[str, list] = {}
    for key, value in all_discovered_sitemaps.items():
        if isinstance(value, dict):
            # Merge nested per-sitemap results without shadowing the outer key.
            for inner_key, inner_urls in value.items():
                if flat_sitemaps.get(inner_key):
                    flat_sitemaps[inner_key].extend(inner_urls)
                else:
                    flat_sitemaps[inner_key] = inner_urls
        else:
            flat_sitemaps[key] = value
return flat_sitemaps
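A small sketch of what the merge produces, using hypothetical sitemap URLs:

.. code-block:: python

    nested = {
        "https://www.example.com/sitemap_index.xml": {
            "https://www.example.com/sitemap-posts.xml": ["https://www.example.com/post-1"],
        },
        "https://www.example.com/sitemap-pages.xml": ["https://www.example.com/about"],
    }
    flat = _flatten_sitemap_dictionaries(nested)
    # {"https://www.example.com/sitemap-posts.xml": ["https://www.example.com/post-1"],
    #  "https://www.example.com/sitemap-pages.xml": ["https://www.example.com/about"]}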
def retrieve_sitemap_urls(
root_page: str,
as_flat_list: bool = True,
allow_xml_inference: bool = True,
thread_max: int = 20,
dedupe_results: bool = True,
) -> Union[list, dict]:
"""
Find all of the URLs in every sitemap associated with a provided domain.
This function will take a bit of time to run depending on how many URLs are discovered.
:param root_page: The root page of the domain to search for sitemaps.
:type root_page: str
:param as_flat_list: Whether or not to return the URLs as a flat list.
:type as_flat_list: bool
:param allow_xml_inference: Whether or not to infer that a URL ending in .xml is a sitemap.
:type allow_xml_inference: bool
:param thread_max: The maximum number of threads to use in sitemap retrieval requests.
:type thread_max: int
:param dedupe_results: Whether or not to remove duplicate URLs.
:type dedupe_results: bool
    :return: A flat list of URLs, or a dictionary mapping each sitemap URL to its URLs when as_flat_list is False.
    :rtype: Union[list, dict]
Example:
.. code-block:: python
import getsitemap
all_urls = getsitemap.retrieve_sitemap_urls("https://www.example.com")
print(all_urls) # ["https://www.example.com", "https://www.example.com/about", ...]
"""
    all_discovered_urls: Dict[str, list] = {}
    root = root_page.rstrip("/")
    parser = rp.RobotFileParser()
    parser.set_url(root + "/robots.txt")
    try:
        parser.read()
    except OSError:
        # robots.txt could not be fetched; fall back to the default location below.
        pass
    # site_maps() returns None when robots.txt lists no Sitemap directives, so
    # always try the conventional /sitemap.xml location as well.
    sitemap_urls = list(parser.site_maps() or [])
    if root + "/sitemap.xml" not in sitemap_urls:
        sitemap_urls.append(root + "/sitemap.xml")
    unique_sitemaps = list(set(sitemap_urls))
    new_urls = _concurrent_thread_starter(unique_sitemaps, thread_max, allow_xml_inference, dedupe_results)
    all_discovered_urls.update(new_urls)
if as_flat_list:
return [url for url_list in all_discovered_urls.values() for url in url_list]
else:
return _flatten_sitemap_dictionaries(all_discovered_urls)
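A brief sketch of both return shapes, against a hypothetical domain:

.. code-block:: python

    import getsitemap

    # Default: one flat list of every URL discovered across all sitemaps.
    flat = getsitemap.retrieve_sitemap_urls("https://www.example.com")
    # ["https://www.example.com/", "https://www.example.com/about", ...]

    # as_flat_list=False: a dictionary keyed by the sitemap each URL came from.
    by_sitemap = getsitemap.retrieve_sitemap_urls("https://www.example.com", as_flat_list=False)
    # {"https://www.example.com/sitemap.xml": ["https://www.example.com/", ...]}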