Checking links on a website with Python
Dead links are a common problem for live websites, and you can use Python to check for them. The requests and Beautiful Soup libraries make it easy to fetch pages as HTML and parse them.
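For example, here's a minimal sketch of that approach (the URL is a placeholder): fetch a page with requests, parse it with Beautiful Soup, and print the href and display text of every <a> tag.

import requests
from bs4 import BeautifulSoup

# Fetch a page and list every link it contains.
html = requests.get("https://example.com").text
soup = BeautifulSoup(html, "html.parser")
for a_tag in soup.find_all("a"):
    print(a_tag.get("href"), a_tag.get_text(strip=True))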
Example script
Given a URL called the starting page, the following check_links_on_website.py script performs these steps:
1. Get all href links from <a> tags on the starting page.
2. Get the status code for each href link. A status code of 200 means the link loads OK.
3. If you want, recursively perform steps 1 and 2 on all pages from the same website that the starting page links to.
4. Output the results to the console and a .csv file in the format: page checked, status code, href, display text in <a> tag, absolute url checked for status code.
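For example, a row in the .csv output might look like this (the URLs are hypothetical placeholders):

https://domain.tld/page, 200, /about/, About, https://domain.tld/about/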
The script gets the status code for each href link even if the link points to a different website, but it limits the recursive check to pages on the same domain as the starting page. For example, if your domain.tld website links to domain.tld/page and wikipedia.org/page, the script can check all the links on domain.tld/page, but not the links on wikipedia.org/page.
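The script below implements this restriction with a small regex-based helper (get_domain) plus a startswith check against the base URL. As a rough, hypothetical illustration of the same idea (not part of the script), you could compare the network locations of two URLs with Python's urllib.parse:

from urllib.parse import urlparse

def is_same_domain(url: str, base_url: str) -> bool:
    # Hypothetical helper: two URLs belong to the same website if their
    # network locations (for example, domain.tld) match.
    return urlparse(url).netloc == urlparse(base_url).netloc

print(is_same_domain("https://domain.tld/page", "https://domain.tld"))     # True
print(is_same_domain("https://wikipedia.org/page", "https://domain.tld"))  # False

Here's the full script: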
#! /usr/bin/python3
##
# Python 3.8.10
# WSL: Ubuntu-20.04
# Copyright 2022 Theodore Chu
# This source code is licensed under the MIT license
# available at https://theochu.com/MIT-License.txt
##
import argparse, re, traceback, requests
from os import path
from bs4 import BeautifulSoup
from io import TextIOWrapper
STARTING_PAGE = "https://theochu.com/about/"
DOMAIN = "theochu.com"
HTTPS = "https://"
BASE_URL = HTTPS + DOMAIN
FILE_OUT = "check_links_on_website"
FILE_OUT_CSV = FILE_OUT + ".csv"
ABSOLUTE_URL = "Absolute url of link"
DISPLAY_TEXT = "Display text in <a> tag"
HREF = "HREF link in the <a> tag"
STATUS_CODE = "Status code of link"
URL = "URL of page"
TYPE_NAVIGABLE_STRING = "<class 'bs4.element.NavigableString'>"
REGEXP_ANCHOR_LINK = "#(.*)$"
REGEXP_HTTP = "http[s]?://"
REGEXP_HTTP_LINK_WITH_PATH = "http[s]?://(.*?)/"
pages_checked = []
pages_to_check = []
status_codes = {}

def get_domain(url: str):
    """
    Get the domain of a website given its url. For example,
    if given https://subdomain.domain.tld/path/to/page or
    http://subdomain.domain.tld, then return subdomain.domain.tld
    """
    matches = re.findall(REGEXP_HTTP_LINK_WITH_PATH, url)
    if matches:
        return matches[-1]
    else:
        return re.sub(REGEXP_HTTP, "", url)

def get_absolute_url(base_url: str, path: str):
    "Get absolute url given base_url and path"
    if path == "#" or path == "/":
        return base_url
    if path.startswith("/") or path.startswith("#"):
        return base_url + path
    else:
        return path

def get_links_on_page(
    url: str,
    base_url: str,
    file_out: TextIOWrapper,
    output_only_bad_links: bool,
):
    """
    Get the links on a page. Check them and output the
    result to the console and output file.
    """
    print("Checking links for", url)
    pages_checked.append(url)
    pages_to_check.remove(url)
    try:
        html = requests.get(url).text
        soup = BeautifulSoup(html, "html.parser")
        a_tags = soup.find_all("a")
        for link in a_tags:
            href = link.get("href")
            # Skip <a> tags that have no href attribute.
            if href is None:
                continue
            # Note: checking same-page links (hrefs that start with "#")
            # isn't meaningful, because if the page itself returns a status
            # code of 200, the same-page link returns 200 as well.
            result = {
                URL: url,
                STATUS_CODE: "",
                HREF: "",
                DISPLAY_TEXT: "",
                ABSOLUTE_URL: "",
            }
            result[HREF] = href
            if link.string is not None:
                result[DISPLAY_TEXT] = str(link.string)
            else:
                for content in link.contents:
                    if str(type(content)) == TYPE_NAVIGABLE_STRING:
                        result[DISPLAY_TEXT] = str(content)
            absolute_url = get_absolute_url(base_url, href)
            result[ABSOLUTE_URL] = absolute_url
            # Don't check if the link is on localhost
            if "localhost:" not in href:
                if absolute_url in status_codes:
                    result[STATUS_CODE] = status_codes[absolute_url]
                else:
                    status_code = requests.get(absolute_url).status_code
                    status_codes[absolute_url] = status_code
                    result[STATUS_CODE] = status_code
            if href != "/" and href != "#":
                if (
                    absolute_url.startswith(base_url)
                    and not re.search(REGEXP_ANCHOR_LINK, absolute_url)
                    and absolute_url not in pages_checked
                    and absolute_url not in pages_to_check
                ):
                    pages_to_check.append(absolute_url)
            if result[STATUS_CODE] != 200 or not output_only_bad_links:
                output_result(result, file_out)
    except Exception as e:
        print("Error checking", url)
        print(e)
        track = traceback.format_exc()
        print(track)

def output_result(result: dict, file_out: TextIOWrapper):
    "Write one result as a comma-separated line to the console and the output file"
    output_array = [
        result[URL],
        str(result[STATUS_CODE]),
        result[HREF],
        result[DISPLAY_TEXT],
        result[ABSOLUTE_URL],
    ]
    print(", ".join(output_array))
    file_out.write(", ".join(output_array) + "\n")

def check_links_on_website(
    starting_page=STARTING_PAGE,
    base_url=BASE_URL,
    file_out=FILE_OUT_CSV,
    all_pages=False,
    output_only_bad_links=False,
):
    "Check the links on starting_page and, if all_pages is True, on every same-domain page it links to"
    pages_to_check.append(starting_page)
    file_out = open(file_out, "w")
    header = [URL, STATUS_CODE, HREF, DISPLAY_TEXT, ABSOLUTE_URL]
    file_out.write(", ".join(header) + "\n")
    while pages_to_check:
        get_links_on_page(
            pages_to_check[0], base_url, file_out, output_only_bad_links
        )
        if not all_pages:
            break
        print("Pages left to check:", pages_to_check)
    file_out.close()

def main():
    parser = argparse.ArgumentParser(
        description="Checks links on a website and "
        + "outputs status codes to a CSV file."
    )
    parser.add_argument(
        "-s",
        "--starting_page",
        type=str,
        help="The url to check (default: {})".format(STARTING_PAGE),
        default=STARTING_PAGE,
    )
    parser.add_argument(
        "-o",
        "--output_file",
        type=str,
        help="The output .csv file (default: {})".format(
            FILE_OUT + "_domain" + ".csv"
        ),
    )
    parser.add_argument(
        "-r",
        "--recursive",
        action="store_true",
        help="Recursively check entire website (default: False)",
        default=False,
    )
    parser.add_argument(
        "-oobl",
        "--output_only_bad_links",
        action="store_true",
        help="Output links only with status_code != 200 (default: False)",
        default=False,
    )
    args = parser.parse_args()
    starting_page = args.starting_page
    if not re.match(REGEXP_HTTP, starting_page):
        starting_page = HTTPS + starting_page
    domain = get_domain(starting_page)
    base_url = HTTPS + domain
    # Default the output file to check_links_on_website_<domain>.csv
    # when no -o/--output_file argument is given.
    file_out = args.output_file
    if file_out is None:
        file_out = FILE_OUT + "_" + domain + ".csv"
    if path.exists(file_out):
        print("WARNING. The file '" + file_out + "' already exists.")
        print("You're about to overwrite the file.")
        confirmation = input("Are you sure you want to proceed? (y/n) ")
        if not (confirmation.lower() == "y"):
            print("You didn't input 'y' or 'Y'. Exiting...")
            exit()
        print("You inputted '" + confirmation + "'. Proceeding...")
    check_links_on_website(
        starting_page=starting_page,
        base_url=base_url,
        file_out=file_out,
        all_pages=args.recursive,
        output_only_bad_links=args.output_only_bad_links,
    )

if __name__ == "__main__":
    main()
Using the script
Before you use the example script, read its limitations. To use the script:
- Install the requests and bs4 (Beautiful Soup) libraries:
  pip install requests bs4
- Create a file called check_links_on_website.py and paste the script into it. Save the file.
- Run the script, replacing https://example.com with the page you want to check:
  python3 check_links_on_website.py -s https://example.com
To stop the script while it's running, press Ctrl+C on your keyboard.
Options
The script has the following options:
- To specify a starting page, run the script with -s <https://domain.tld>.
- To specify an output file, run the script with -o <outputfile.csv>.
- To recursively check all pages linked from the starting page, run the script with the -r or --recursive flag.
- To output only links with status codes other than 200, run the script with the -oobl or --output_only_bad_links flag.

For more information on the options, run the script with the --help flag:
python3 check_links_on_website.py --help
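For example, this hypothetical invocation recursively checks example.com and writes only the links that don't return a 200 status code to results.csv:

python3 check_links_on_website.py -s https://example.com -r -oobl -o results.csv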
Limitations
The example script has the following limitations:
- The script executes synchronously, so recursively checking a large website can take a while.
- If the script finds an href link with a root-relative path instead of an absolute URL, such as /path/to/page, then the script checks that page with https:// instead of http://. This might be a problem if the page doesn't support https://.
- The script might not work with relative paths such as ./path/to/page, ../path/to/page, or ../../path/to/page (see the sketch after this list for one way to resolve them).
- If you stop the script while it's executing recursively, it restarts from the beginning when you run it again on the same website. The script doesn't yet pick up where it left off.
- The script focuses on retrieving status codes, so it doesn't detect the validity of anchor links. If you want to check links in Markdown files, consider using remark-validate-links instead.
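As a hypothetical sketch of how the relative-path limitation could be addressed (not part of the script above), Python's standard urllib.parse.urljoin resolves relative, root-relative, and absolute hrefs against the URL of the page being checked:

from urllib.parse import urljoin

# Sketch only: resolve different href styles against a hypothetical page URL.
page_url = "https://domain.tld/docs/guide/"
print(urljoin(page_url, "./path/to/page"))              # https://domain.tld/docs/guide/path/to/page
print(urljoin(page_url, "../path/to/page"))             # https://domain.tld/docs/path/to/page
print(urljoin(page_url, "/path/to/page"))               # https://domain.tld/path/to/page
print(urljoin(page_url, "https://wikipedia.org/page"))  # https://wikipedia.org/page

Swapping a call like this into the script in place of get_absolute_url would be one way to handle relative paths, at the cost of passing the current page's URL instead of only the base URL.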