How to load a gzipped XML Index-Sitemap into Pandas using BeautifulSoup


In some cases you want to do more than just crawl XML Sitemaps with e.g. Screaming Frog or Deepcrawl.


  • replace “beta” URLs with “www” URLs
  • filter for time ranges

That’s how to load an index sitemap + all gzipped sub-sitemaps into a Pandas dataframe:

"""Load a gzipped XML index sitemap into a pandas DataFrame.

Fetches the index sitemap, follows every <sitemap><loc> entry to its
gzipped sub-sitemap, decompresses it, and collects the loc / lastmod /
changefreq of every <url> into one DataFrame.
"""
import gzip
import io

import pandas as pd
import requests
from bs4 import BeautifulSoup

# Load this index sitemap (fill in the index-sitemap URL here).
r = requests.get("")
soup = BeautifulSoup(r.text, "lxml")
sitemap_tags = soup.find_all("sitemap")

urls_list = []
lastmod_list = []
changefreq_list = []
urls_total = 0

# Loop over every sub-sitemap URL listed in the index sitemap.
for sitemap in sitemap_tags:
    subsitemap = sitemap.findNext("loc").text

    # Download the gzipped sub-sitemap and decompress it to plain XML.
    # (requests auto-decodes transport-level gzip, but these sitemaps are
    # .xml.gz files, so the payload itself still needs gunzipping.)
    resp = requests.get(subsitemap)
    with gzip.GzipFile(fileobj=io.BytesIO(resp.content)) as gz:
        data = gz.read()

    soup2 = BeautifulSoup(data, "lxml")
    url_tags = soup2.find_all("url")
    urls_total += len(url_tags)
    # print(f"{urls_total} - {len(url_tags)}")

    # Collect loc / lastmod / changefreq for every URL within every
    # sub-sitemap; missing optional tags are recorded as None.
    for url in url_tags:
        loc = url.find("loc")
        lastmod = url.find("lastmod")
        changefreq = url.find("changefreq")
        urls_list.append(loc.text if loc else None)
        lastmod_list.append(lastmod.text if lastmod else None)
        changefreq_list.append(changefreq.text if changefreq else None)

df = pd.DataFrame({
    "urls_list": urls_list,
    "lastmod_list": lastmod_list,
    "changefreq_list": changefreq_list,
})

© 2020 Tobias Willmann