new url_to_id and added dry-run in burrow

master
KatolaZ 6 years ago
parent 040ba18f7f
commit 1185679222
  1. 29
      README.md
  2. 21
      burrow
  3. 55
      url_to_id

@ -0,0 +1,29 @@
## Burrow-The-Burrows
A Gopher burrower in a shell script. By using `burrow` and a bit of
plumbing you can get all the links in a Gopher MENU, recursively visit
all the available subdirs, and create a directed graph of the visited
selectors.
`burrow` takes as input a gopher identifier, as generated by
`url_to_id`, treats the document it points to as a gophermap, and prints
on stdout the list of menu selectors found in that document. `burrow`
also dumps on stderr the list of all the edges (to any kind of selector)
found in that page, in the format:
src_SHA256 dst_SHA256
where `src_SHA256` is the SHA256 of the source selector (the current
document), while `dst_SHA256` is the SHA256 of the destination selector
(the pointed-to document).
To start a crawl, one can do something like:
```
$ ./url_to_id gopher://your.gopher.url/ > ids
$ tail -f ids | parallel -j2 './burrow {}' 2>> graph.txt | tee -a ids >/dev/null &
```
Notice that `burrow` will create a certain number of folders in the
current directory, used to keep track of the selectors that have been
already retrieved.

@ -7,6 +7,19 @@
###
### where SHA256 is the SHA256SUM of "1|SELECTOR|HOST|PORT"
###
### *** DRY RUN ***
###
### If run as burrow?* (i.e., under a name made of "burrow" followed by
### at least one character), burrow will run in DRY-RUN mode, i.e., it
### will just check if the id provided as input exists, and then exit.
###
###------------------------------------------------
###
### (C) Vincenzo 'KatolaZ' Nicosia <katolaz@freaknet.org>
###
### Use, modify, redistribute under the terms of the GNU General
### Public License version 3 or, at your option, any other version.
###
## function
get_dirs(){
src_id="$1"
@ -56,7 +69,7 @@ retrieve_selector(){
check_selector_present(){
sel_id="$1"
sel_dir="$(echo ${sel_id} | cut -c -2)"
[ -d "${sel_dir}" -a -f "${sel_dir}/${sel_id}" ] && exit
[ -d "${sel_dir}" -a -f "${sel_dir}/${sel_id}" ] && echo "${SRC}" >>present && exit
## {
## if at least one of the neighbours of sel_id is missing, cat the entire list of
## neighbours to be re-visited and exit
@ -69,6 +82,8 @@ check_selector_present(){
[ ! -d "${sel_dir}" ] && mkdir -p "${sel_dir}"
}
[ $# -lt 1 ] && echo "Usage: $0 <gopherlink>" && exit 1
SRC="$1"
@ -78,5 +93,9 @@ check_selector_present "${src_id}"
echo "selector ${src_id} not found" >> logfile.txt
## Dry-run dispatch: the selector is actually retrieved only when the
## script is invoked exactly as "burrow". ${MYNAME##burrow} strips a
## leading "burrow" from the name, so the result is empty only for the
## plain name; under any other name (burrow?*) nothing is retrieved.
## $0 is quoted so that a script path containing whitespace is not split
## into several basename arguments (which would yield a wrong name and
## silently turn a normal run into a dry run).
MYNAME="$(basename "$0")"
if [ -z "${MYNAME##burrow}" ]; then
  retrieve_selector "$SRC" | sed -r -e 's/\t/|/g' | get_dirs "${src_id}"
fi

@ -1,15 +1,25 @@
#!/bin/sh
## get a selector in gph format:
##
## Get a gopherlink in the format:
##
## gopher://domain.org:port/*/my/cool/selector
##
## or a selector in gph format:
##
## [TYPE|SEL|HOST|PORT]
##
## and print on output the corresponding selectorid:
## and print on output the corresponding "unique" selectorid:
##
## TYPE|SEL|HOST|PORT|SHA256
##
## which is understood by `burrow`
###
### get a selector in gph format and transform it into a selectorid
###
## function
gph_to_id(){
gph="$( echo $1| sed 's/\[//g;s/\]//g')"
OLDIFS=$IFS
@ -20,4 +30,45 @@ gph_to_id(){
IFS="$OLDIFS"
}
###
### Get a gopher URL and transform it into a selectorid
###
###   $1: gopher://HOST[:PORT][/TYPE[/SELECTOR]]
###
### PORT defaults to 70. If no TYPE is given, the URL is treated as the
### root menu of HOST (type 1, selector "/"). TYPE must be a single
### character among [0-9ITghis+], and HOST must not be empty; otherwise
### "Invalid Gopher URL" is printed on stderr.
###
### The resulting "[TYPE|SEL|HOST|PORT]" string is handed to gph_to_id,
### which is expected to print the corresponding selectorid.
###
### NOTE: on invalid input (exit 1) and for URLs without a TYPE (exit 0)
### this function calls exit, terminating the calling shell.
###
## function
gopherurl_to_id(){
  ## Strip only the leading scheme: the selector part may legitimately
  ## contain the string "gopher://" and must be left untouched.
  URL="${1#gopher://}"
  ## Everything up to the first "/" is HOST[:PORT]
  hostport="${URL%%/*}"
  host="${hostport%%:*}"
  [ -z "$host" ] && echo "Invalid Gopher URL" >&2 && exit 1
  ## -s: print nothing when there is no ":" (i.e., no explicit port)
  port="$(printf '%s\n' "$hostport" | cut -s -d ":" -f 2)"
  [ -z "$port" ] && port='70'
  ## -s: print nothing when the URL has no "/" at all
  type="$(printf '%s\n' "$URL" | cut -s -d "/" -f 2)"
  [ -z "$type" ] && {
    type='1'
    sel="/"
    gph_to_id "[${type}|${sel}|${host}|$port]"
    exit 0
  }
  ## The type must be exactly one of the accepted item-type characters.
  ## A case pattern is used so that a type such as "*" or "?" is never
  ## subject to pathname expansion.
  case "$type" in
    [0-9ITghis+]) ;;
    *) echo "Invalid Gopher URL" >&2; exit 1 ;;
  esac
  ## Whatever follows HOST[:PORT]/TYPE/ is the selector ("/" if absent)
  sel="/$(printf '%s\n' "$URL" | cut -s -d "/" -f 3-)"
  gph_to_id "[${type}|${sel}|${host}|$port]"
}
## Entry point: dispatch on the shape of the first argument.
##
## The argument is matched with case patterns on the quoted value rather
## than through an unquoted `echo $1`: a gph selector such as
## [1|sel|host|70] is itself a glob bracket expression, and could be
## replaced by the name of a matching file in the current directory.
## Usage and error messages go to stderr, so that they never end up in a
## redirected list of ids.
[ $# -lt 1 ] && {
  echo "Usage: $0 <gopherurl>" >&2
  echo " $0 <gphselector>" >&2
  exit 1
}
case "$1" in
  gopher://*)
    ## gopher://HOST[:PORT][/TYPE[/SELECTOR]]
    gopherurl_to_id "$1"
    exit 0
    ;;
  \[*\])
    ## [TYPE|SEL|HOST|PORT]
    gph_to_id "$1"
    exit 0
    ;;
esac
echo "No valid URL or gph selector provided" >&2
exit 1

Loading…
Cancel
Save