Improvements to parallel URL check

Ben Edgington
2023-09-28 14:24:11 +01:00
parent 46091d999f
commit be03657b1b

@@ -14,7 +14,7 @@ if [ ! -f "$1" ]; then
     exit 1
 fi

-file=$1
+markdown_file=$1

 # Number of concurrent checks
 npara=8
@@ -26,7 +26,7 @@ timeout=10
 github_secret=$(cat $(dirname "$0")/../priv/github.txt)

 # File to store non-200 URLs
-non_200_urls_file=$(mktemp)
+non_200_urls_tmp=$(mktemp)

 # Where to find the book itself (for relative links that are really absolute)
 selfserver=https://eth2book.info
@@ -47,25 +47,32 @@ check_url() {
     if [ "200" -ne "$res" ]
     then
         echo "*** $res ***"
-        echo "$res $x" >> $non_200_urls_file
+        echo "$res $x" >> $non_200_urls_tmp
     fi
 }
 export -f check_url
-export timeout github_secret selfserver non_200_urls_file
+export timeout github_secret non_200_urls_tmp

 # Extract URLs and pass them to check_url function in parallel
-cat $file | sed "s|(/\.\.|($selfserver|g" | grep -Pho '\(\Khttp[^)]+' | sed 's/#.*$//g' | sort -u | xargs -P $npara -I {} bash -c 'check_url "$@"' _ {}
+cat $markdown_file \
+    | sed "s|(/\.\.|($selfserver|g" \
+    | grep -Pho '\(\Khttp[^)]+' \
+    | sed 's/#.*$//g' \
+    | sort -u \
+    | xargs -P $npara -I {} bash -c 'check_url "$@"' _ {}

 # Print non-200 URLs
+exit_code=0
 echo
-if [ -s $non_200_urls_file ]
+if [ -s $non_200_urls_tmp ]
 then
     echo "*** Failing URLs: ***"
-    cat $non_200_urls_file
-    rm $non_200_urls_file
-    exit 1
+    cat $non_200_urls_tmp
+    exit_code=1
 else
     echo "*** All URLs are good ***"
-    exit 0
 fi
+
+rm $non_200_urls_tmp
+exit $exit_code
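
For context, the pattern the script now follows is: export a bash function, fan the URLs out to it with xargs -P, collect failures in a temp file, and defer the exit status until after the temp file has been removed. Below is a minimal standalone sketch of that pattern (not the repository's script itself); the input file urls.txt and the 10-second timeout are illustrative assumptions.

#!/usr/bin/env bash
# Illustrative sketch only: parallel URL checking with an exported function,
# a temp file for failures, and a deferred exit code.

npara=8                 # number of concurrent checks
timeout=10              # per-request timeout in seconds (assumed value)
failures=$(mktemp)      # collects "status URL" lines for non-200 responses

check_url() {
    local url=$1
    local res
    # Discard the body; print only the HTTP status code.
    res=$(curl -s -o /dev/null -w '%{http_code}' --max-time "$timeout" "$url")
    if [ "$res" != "200" ]; then
        echo "$res $url" >> "$failures"
    fi
}
export -f check_url
export timeout failures

# Run up to $npara checks at once; each bash -c invocation receives one URL
# as its single positional argument ("_" fills $0).
# urls.txt (one URL per line) is a hypothetical input file.
xargs -P "$npara" -I {} bash -c 'check_url "$@"' _ {} < urls.txt

# Report and clean up before exiting, so the temp file is always removed.
exit_code=0
if [ -s "$failures" ]; then
    echo "*** Failing URLs: ***"
    cat "$failures"
    exit_code=1
fi
rm "$failures"
exit $exit_code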