From 11856792221c7faa2f423bc106d1f4b1482bcdb8 Mon Sep 17 00:00:00 2001 From: KatolaZ Date: Wed, 1 Aug 2018 10:27:54 +0100 Subject: [PATCH] new url_to_id and added dry-run in burrow --- README.md | 29 ++++++++++++++++++++++++++++ burrow | 23 ++++++++++++++++++++-- url_to_id | 57 ++++++++++++++++++++++++++++++++++++++++++++++++++++--- 3 files changed, 104 insertions(+), 5 deletions(-) create mode 100644 README.md diff --git a/README.md b/README.md new file mode 100644 index 0000000..be73adc --- /dev/null +++ b/README.md @@ -0,0 +1,29 @@ +## Burrow-The-Burrows + +A Gopher burrower in a shell script. By using `burrow` and a bit of +plumbing you can get all the links in a Gopher MENU, recursively visit +all the available subdirs, and create a directed graph of the visited +selectors. + +`burrow` takes as input a gopher identifier, as generated by +`url_to_id`, which is treated as a gophermap, and prints on stdout the +list of menu selectors found in that document. `burrow` will also dump +on stderr the list of all the edges (to any kind of selector) found in +that page, in the format: + + src_SHA256 dst_SHA256 + +where `src_SHA256` is the SHA256 of the source selector (the current +document), while `dst_SHA256` is the SHA256 of the destination selector (the pointed-to +document). + +To start a crawl, one can do something like: + +``` + $ ./url_to_id gopher://your.gopher.url/ > ids + $ tail -f ids | parallel -j2 './burrow {}' 2>> graph.txt | tee -a ids >/dev/null & +``` + +Note that `burrow` will create a certain number of folders in the +current directory, used to keep track of the selectors that have been +already retrieved. 
diff --git a/burrow b/burrow index 2665d21..8138ca5 100755 --- a/burrow +++ b/burrow @@ -7,6 +7,19 @@ ### ### where SHA256 is the SHA256SUM of "1|SELECTOR|HOST|PORT" ### +### *** DRY RUN *** +### +### If run as burrow?* (i.e., "burrow" followed by at least one +### character), burrow will run in DRY MODE, i.e., it will just check +### if the id provided as input exists, and then exit. +### +###------------------------------------------------ +### +### (C) Vincenzo 'KatolaZ' Nicosia +### +### Use, modify, redistribute under the terms of the GNU General +### Public License version 3 or, at your option, any other version. +### ## function get_dirs(){ src_id="$1" @@ -56,7 +69,7 @@ retrieve_selector(){ check_selector_present(){ sel_id="$1" sel_dir="$(echo ${sel_id} | cut -c -2)" - [ -d "${sel_dir}" -a -f "${sel_dir}/${sel_id}" ] && exit + [ -d "${sel_dir}" -a -f "${sel_dir}/${sel_id}" ] && echo "${SRC}" >>present && exit ## { ## if at least one of the neighbours of sel_id is missing, cat the entire list of ## neighbours to be re-visited and exit @@ -69,6 +82,8 @@ check_selector_present(){ [ ! 
-d "${sel_dir}" ] && mkdir -p "${sel_dir}" } +[ $# -lt 1 ] && echo "Usage: $0 " && exit 1 + SRC="$1" @@ -78,5 +93,9 @@ check_selector_present "${src_id}" echo "selector ${src_id} not found" >> logfile.txt -retrieve_selector "$SRC" | sed -r -e 's/\t/|/g' | get_dirs "${src_id}" +MYNAME=$(basename $0) + +if [ -z "${MYNAME##burrow}" ]; then + retrieve_selector "$SRC" | sed -r -e 's/\t/|/g' | get_dirs "${src_id}" +fi diff --git a/url_to_id b/url_to_id index 69cc0fc..4d96442 100755 --- a/url_to_id +++ b/url_to_id @@ -1,15 +1,25 @@ #!/bin/sh -## get a selector in gph format: +## +## Get a gopherlink in the format: +## +## gopher://domain.org:port/*/my/cool/selector +## +## or a selector in gph format: ## ## [TYPE|SEL|HOST|PORT] ## -## and print on output the corresponding selectorid: +## and print on output the corresponding "unique" selectorid: ## ## TYPE|SEL|HOST|PORT|SHA256 ## ## which is understood by `burrow` + +### +### get a selector in gph format and transform it in a selectorid +### +## function gph_to_id(){ gph="$( echo $1| sed 's/\[//g;s/\]//g')" OLDIFS=$IFS @@ -20,4 +30,45 @@ gph_to_id(){ IFS="$OLDIFS" } -gph_to_id "$1" +### +### Get a gopherurl and transform it in a selectorid +### +## function +gopherurl_to_id(){ + URL="$(echo $1 | sed 's,gopher://,,g')" + hostport=$(echo "$URL" | cut -d "/" -f 1) + host="$(echo $hostport | cut -d ":" -f 1)" + port="$(echo $hostport | cut -s -d ":" -f 2)" + [ -z "$port" ] && port='70' + type=$(echo "$URL" | cut -s -d "/" -f 2) + [ -z "$type" ] && { + type='1' + sel="/" + gph_to_id "[${type}|${sel}|${host}|$port]" + exit 0 + } + [ -n "${type#?}" ] && echo "Invalid Gopher URL" >&2 && exit 1 + ## Check if type is a valid one + type="$(echo $type | sed -n '/^[0-9ITghis+]$/p')" + [ -z "${type}" ] && echo "Invalid Gopher URL" >&2 && exit 1 + sel=/$(echo "$URL" | cut -s -d "/" -f 3-) + gph_to_id "[${type}|${sel}|${host}|$port]" + +} + + + +[ $# -lt 1 ] && echo "Usage: $0 " && echo " $0 " && exit 1 + + +[ -n "$(echo $1 | sed -n 
'/^gopher:\/\//p')" ] && { + gopherurl_to_id "$1" + exit 0 +} + +[ -n "$(echo $1 | sed -n '/^\[.*\]$/p')" ] && { + gph_to_id "$1" + exit 0 +} +echo "No valid URL or gph selector provided" >&2 +exit 1