This is a patch from Reinier Post for Harvest 1.5.20. It makes it possible to create an object for redirected and broken links. This can be used for checking links with harvest's gatherer. You can enable this functionality by doing: HARVEST_INCLUDE_REDIR=1; export HARVEST_INCLUDE_REDIR HARVEST_INCLUDE_BROKEN=1; export HARVEST_INCLUDE_BROKEN before running RunGatherer. kj ------------------------------------------------ My patches are attached, and I think they reflect everything I did to the C parts of the gatherer. They may not be fit for inclusion, because I didn't fully understand the code when I was doing this, and I haven't tested the effect extensively. The patch makes the gatherer output additional objects for redirections and error documents, if the appropriate environment variables are set. What I was working on was a soif2soif converter that would add even more attributes. It works by introducing an additional step into the gdbm file manipulations on the gatherer side. For instance, I add reverse attributes. I was adding this to the gatherer side because the broker side code was much too hard for me to penetrate. -- Reinier diff -cr harvest-1.5.20/src/common/include/template.h harvest-1.5.20-rp/src/common/include/template.h *** harvest-1.5.20/src/common/include/template.h Fri Jun 12 16:35:26 1998 --- harvest-1.5.20-rp/src/common/include/template.h Mon Jun 21 17:08:39 1999 *************** *** 137,142 **** --- 137,144 ---- #define T_UPDATE "Update-Time" #define T_URL "URL" #define T_UREFS "URL-References" + #define T_URL_REDIRECT "URL-Redirect" + #define T_URL_ERROR "URL-Error" /* Backwards compatibility */ #define T_FILETYPE T_TYPE diff -cr harvest-1.5.20/src/common/include/version.h harvest-1.5.20-rp/src/common/include/version.h *** harvest-1.5.20/src/common/include/version.h Fri Jun 12 16:35:27 1998 --- harvest-1.5.20-rp/src/common/include/version.h Tue Jun 22 19:33:08 1999 *************** *** 3,7 **** */ #ifndef HARVEST_VERSION /* UPDATE VERSION HERE */ ! 
#define HARVEST_VERSION "1.5.19" #endif --- 3,7 ---- */ #ifndef HARVEST_VERSION /* UPDATE VERSION HERE */ ! #define HARVEST_VERSION "1.5.20-patched" #endif diff -cr harvest-1.5.20/src/gatherer/enumerate/enum.c harvest-1.5.20-rp/src/gatherer/enumerate/enum.c *** harvest-1.5.20/src/gatherer/enumerate/enum.c Fri Jun 12 16:35:51 1998 --- harvest-1.5.20-rp/src/gatherer/enumerate/enum.c Tue Jun 22 10:07:21 1999 *************** *** 358,364 **** char *stamp; /* the md5/timestamp from enum */ { Debug(40, 9, ("do_add: Saving %s, %s\n", name, stamp)); ! fprintf(fadd, "%s\t%s:%s\n", name, cur_attr, stamp); fflush(fadd); /* MUST flush */ } --- 358,371 ---- char *stamp; /* the md5/timestamp from enum */ { Debug(40, 9, ("do_add: Saving %s, %s\n", name, stamp)); ! /* by rp@win.tue.nl to accomodate redirect and error objects: */ ! if (!strncmp(stamp,T_URL_REDIRECT, strlen(T_URL_REDIRECT)) || ! !strncmp(stamp,T_URL_ERROR, strlen(T_URL_ERROR))) { ! fprintf(fadd, "%s\t%s\n", name, stamp); ! } ! else { ! fprintf(fadd, "%s\t%s:%s\n", name, cur_attr, stamp); ! } fflush(fadd); /* MUST flush */ } diff -cr harvest-1.5.20/src/gatherer/enumerate/httpenum-breadth.c harvest-1.5.20-rp/src/gatherer/enumerate/httpenum-breadth.c *** harvest-1.5.20/src/gatherer/enumerate/httpenum-breadth.c Fri Jun 12 16:35:53 1998 --- harvest-1.5.20-rp/src/gatherer/enumerate/httpenum-breadth.c Tue Jun 22 15:03:19 1999 *************** *** 128,133 **** --- 128,134 ---- #include #include "util.h" #include "url.h" + #include "template.h" #define PUBLIC extern #include "filter.h" *************** *** 158,163 **** --- 159,166 ---- int depth_hist[100]; /* Local variables */ + static int print_redirections = 0; + static int print_broken_urls = 0; static int url_max = 0; static int nurls = 0; static int host_max = 0; *************** *** 384,390 **** #ifdef HOST_COUNT_IP Host *h = NULL; #endif ! 
if (host_in_db(up->host)) /* Host is already in the db */ return (1); if (++nhosts > host_max) --- 387,401 ---- #ifdef HOST_COUNT_IP Host *h = NULL; #endif ! /* ! * for news: URLs, a NULL up->host is possible -> SEGV (on Solaris 2.5.1) ! * fix by rp@win.tue.nl, Jun 16 1999 ! * note: RobotsTxtCheck() has the same problem - not fixed, but we ! * avoid calling it by returning 0 here ! */ ! if (!up->host) ! return (0); ! /* end of fix */ if (host_in_db(up->host)) /* Host is already in the db */ return (1); if (++nhosts > host_max) *************** *** 498,505 **** xfree(s); s = NULL; - while(((status=url_retrieve(up)) == -1) && counturl)) return 0; } --- 509,523 ---- xfree(s); s = NULL; while(((status=url_retrieve(up)) == -1) && countredir_from_url) { + Debug(42, 1, ("Redirected URL: ??? -> %s\n", up->url)); + } else { + fprintf(stdout, "%s\t%s:%s\n", + up->redir_from_url, T_URL_REDIRECT, up->url); + } + } count++; if (!url_is_allowed(up->url)) return 0; } *************** *** 511,516 **** --- 529,538 ---- mark_failed(up); #endif + if (print_broken_urls) { + fprintf(stdout, "%s\t%s:%d\n", up->url, T_URL_ERROR, status); + } + return 0; } if (up->md5 && md5_in_db(up->md5)) { /* Have we been here? */ *************** *** 624,629 **** --- 646,655 ---- max_depth = atoi(s); if ((s = getenv("HARVEST_DEPTH_CUR")) != NULL) cur_depth = atoi(s); + if ((s = getenv("HARVEST_INCLUDE_REDIR")) != NULL) + print_redirections = atoi(s); + if ((s = getenv("HARVEST_INCLUDE_BROKEN")) != NULL) + print_broken_urls = atoi(s); if (url_max < 1) url_max = 250; /* hard-coded maximum */ if (host_max < 1) *************** *** 784,791 **** } Debug(42, 1, ("Processing: [%2d] %s\n", depth, url)); ! if ((up = url_open(url)) == NULL) continue; if ((up->type != URL_HTTP)) { Debug(42, 1, ("Examining: [%d:%d] %s\n", depth, max_depth, up->url)); --- 810,821 ---- } Debug(42, 1, ("Processing: [%2d] %s\n", depth, url)); ! if ((up = url_open(url)) == NULL) { ! if (print_broken_urls) { ! 
fprintf(stdout, "%s\t%s:0\n", T_URL_ERROR, up->url); ! } continue; + } if ((up->type != URL_HTTP)) { Debug(42, 1, ("Examining: [%d:%d] %s\n", depth, max_depth, up->url)); diff -cr harvest-1.5.20/src/gatherer/enumerate/httpenum-depth.c harvest-1.5.20-rp/src/gatherer/enumerate/httpenum-depth.c *** harvest-1.5.20/src/gatherer/enumerate/httpenum-depth.c Fri Jun 12 16:35:53 1998 --- harvest-1.5.20-rp/src/gatherer/enumerate/httpenum-depth.c Tue Jun 22 15:02:59 1999 *************** *** 47,52 **** --- 47,53 ---- #include #include "util.h" #include "url.h" + #include "template.h" #define PUBLIC extern #include "filter.h" *************** *** 70,75 **** --- 71,78 ---- int depth_hist[100]; /* Local variables */ + static int print_redirections = 0; + static int print_broken_urls = 0; static int url_max = 0; static int nurls = 0; static int host_max = 0; *************** *** 180,185 **** --- 183,192 ---- datum k; int r; + if (url == NULL) { + fprintf(stderr,"null url in url_in_db\n"); + return 0; + } Debug(42, 9, ("url_in_db: checking for url='%s'\n", url)); k.dptr = xstrdup(url); *************** *** 240,246 **** #ifdef HOST_COUNT_IP Host *h = NULL; #endif ! if (host_in_db(up->host)) /* Host is already in the db */ return (1); if (++nhosts > host_max) --- 247,261 ---- #ifdef HOST_COUNT_IP Host *h = NULL; #endif ! /* ! * for news: URLs, a NULL up->host is possible ! * fix by rp@win.tue.nl, Jun 16 1999 ! * note: RobotsTxtCheck() has the same problem - not fixed, but we ! * avoid calling it by returning 0 here ! */ ! if (!up->host) ! return (0); ! /* end of fix */ if (host_in_db(up->host)) /* Host is already in the db */ return (1); if (++nhosts > host_max) *************** *** 298,303 **** --- 313,319 ---- return 1; } + /* * http_enum() - Returns a linked list of all the URLs in this object, * or NULL on error. 
Checks for "text/html" in MIME headers and then *************** *** 349,356 **** /* Recurse over the redirect chain */ while(((status=url_retrieve(up)) == -1) && countredir_from_url) { ! Debug(42, 1, ("Redirected URL: ??? -> %s\n", up->url)); ! } else { ! fprintf(stdout, "%s\t%s:%s\n", ! up->redir_from_url, T_URL_REDIRECT, up->url); ! } ! } ! count++; ! if (!url_is_allowed(up)) return 0; } if (status) { /* Grab the URL; success? */ *************** *** 360,365 **** --- 384,393 ---- mark_failed(up); #endif + if (print_broken_urls) { + fprintf(stdout, "%s\t%s:%d\n", up->url, T_URL_ERROR, status); + } + return (NULL); } if (up->md5 && md5_in_db(up->md5)) { /* Have we been here? */ *************** *** 550,556 **** process_url(tup, depth + 1); } } ! /* Free List Entry */ xfree(l->ptr); l->ptr = (void *) NULL; --- 578,586 ---- process_url(tup, depth + 1); } } ! else if ((tup == NULL) && print_broken_urls) { ! fprintf(stdout, "%s\t%s:0\n", up->url, T_URL_ERROR); ! } /* Free List Entry */ xfree(l->ptr); l->ptr = (void *) NULL; *************** *** 584,589 **** --- 614,623 ---- max_depth = atoi(s); if ((s = getenv("HARVEST_DEPTH_CUR")) != NULL) cur_depth = atoi(s); + if ((s = getenv("HARVEST_INCLUDE_REDIR")) != NULL) + print_redirections = atoi(s); + if ((s = getenv("HARVEST_INCLUDE_BROKEN")) != NULL) + print_broken_urls = atoi(s); if (url_max < 1) url_max = 250; /* hard-coded maximum */ if (host_max < 1) diff -cr harvest-1.5.20/src/gatherer/enumerate/staturl.c harvest-1.5.20-rp/src/gatherer/enumerate/staturl.c *** harvest-1.5.20/src/gatherer/enumerate/staturl.c Fri Jun 12 16:35:53 1998 --- harvest-1.5.20-rp/src/gatherer/enumerate/staturl.c Mon Jun 21 17:18:53 1999 *************** *** 138,143 **** --- 138,146 ---- extern int liburl_conform_rfc1738; FILE *logfp = NULL; int delay = 0; + int print_redirections = 0; + int print_broken_urls = 0; + int status; if (getenv("HARVEST_GATHERER_LOGFILE") != (char *) NULL) logfp = fopen(getenv("HARVEST_GATHERER_LOGFILE"), "a+"); 
*************** *** 152,157 **** --- 155,166 ---- if ((s = getenv("HARVEST_URL_DELAY"))) delay = atoi(s); + if ((s = getenv("HARVEST_INCLUDE_REDIR"))) + print_redirections = atoi(s); + + if ((s = getenv("HARVEST_INCLUDE_BROKEN"))) + print_broken_urls = atoi(s); + #ifdef HAVE_SETLINEBUF setlinebuf(stdout); /* don't keep pipe waiting */ setlinebuf(stderr); *************** *** 178,183 **** --- 187,195 ---- Debug(41, 1, ("staturl: trying to process: %s\n", buf)); if ((up = url_open(buf)) == NULL) { + if (print_broken_urls) { + printf("%s\t%s:0\n", buf, T_URL_ERROR); + } continue; } if (up->type == URL_FILE) { *************** *** 189,195 **** url_close(up); continue; } ! if (url_retrieve(up)) { url_close(up); continue; } --- 201,219 ---- url_close(up); continue; } ! if ((status = url_retrieve(up))) { ! /* added HARVEST_INCLUDE_* support, Jun 21, 1999 ! for unknown reasons, staturl doesn't follow ! Redirect chains, while httpenum_* do - rp@win.tue.nl ! */ ! if ((status == -1) && print_redirections ! && up->redir_from_url) { ! printf("%s\t%s:%s\n", up->redir_from_url, ! up->url, T_URL_REDIRECT); ! } ! else if (print_broken_urls) { ! printf("%s\t%s:0\n", up->url, T_URL_ERROR); ! } url_close(up); continue; } diff -cr harvest-1.5.20/src/gatherer/essence/db.c harvest-1.5.20-rp/src/gatherer/essence/db.c *** harvest-1.5.20/src/gatherer/essence/db.c Fri Jun 12 16:35:56 1998 --- harvest-1.5.20-rp/src/gatherer/essence/db.c Tue Jun 22 22:28:18 1999 *************** *** 257,266 **** * is non-zero, then the template is appended to any existing * template data for the URL. */ void add_template(template, object) ! Template *template; ! DataObject *object; { datum k, d; Buffer *b = NULL; Template *ct = NULL; --- 257,282 ---- * is non-zero, then the template is appended to any existing * template data for the URL. */ + static void add_template_with_flags(); + void add_template(template, object) ! Template *template; ! 
DataObject *object; { + add_template_with_flags(template,object->flags); + } + + void add_template_nonnested(template) + Template *template; + { + + add_template_with_flags(template, 0); + } + + static void add_template_with_flags(template, flags) + Template *template; + unsigned int flags; + { datum k, d; Buffer *b = NULL; Template *ct = NULL; *************** *** 276,282 **** datum curd; /* If a template already exists, then check nested file. */ ! if ((object->flags & F_NESTED) == 0) { errorlog("Existing GDBM Entry for non-nested %s\n", template->url); xfree(k.dptr); --- 292,298 ---- datum curd; /* If a template already exists, then check nested file. */ ! if ((flags & F_NESTED) == 0) { errorlog("Existing GDBM Entry for non-nested %s\n", template->url); xfree(k.dptr); diff -cr harvest-1.5.20/src/gatherer/essence/main.c harvest-1.5.20-rp/src/gatherer/essence/main.c *** harvest-1.5.20/src/gatherer/essence/main.c Fri Jun 12 16:35:57 1998 --- harvest-1.5.20-rp/src/gatherer/essence/main.c Thu Jun 24 10:02:30 1999 *************** *** 471,476 **** --- 471,478 ---- } /* * The input looks like: + * URLURL-Redirect:URL + * URLURL-Broken:12 * URLMD5:adfasdfasdfasdfasdfasd * URLLast-Modification-Time:12345 */ *************** *** 505,516 **** if ((s = strchr(buf, '\t')) != NULL) { *s++ = '\0'; /* delineate at the tab */ } /* * For MD5's: check database and skip if unchanged * For LMT's: check database and skip if unchanged * For no meta data, just pass it through */ ! 
if (s && !strncasecmp(s, T_MD5, strlen(T_MD5))) { if (dbcheck_md5(buf, s + strlen(T_MD5) + 1)) { continue; } --- 507,531 ---- if ((s = strchr(buf, '\t')) != NULL) { *s++ = '\0'; /* delineate at the tab */ } + /* + * special cases: stamps on broken URLs or redirections + */ + if (s && !strncasecmp(s, T_URL_REDIRECT, + strlen(T_URL_REDIRECT)) && + s[strlen(T_URL_REDIRECT)] == ':') { + summarize_empty_object(buf,T_URL_REDIRECT,&s[strlen(T_URL_REDIRECT)+1]); + continue; + } else if (s && !strncasecmp(s, T_URL_ERROR, + strlen(T_URL_ERROR)) && + s[strlen(T_URL_ERROR)] == ':') { + summarize_empty_object(buf,T_URL_ERROR,&s[strlen(T_URL_ERROR)+1]); + continue; /* * For MD5's: check database and skip if unchanged * For LMT's: check database and skip if unchanged * For no meta data, just pass it through */ ! } else if (s && !strncasecmp(s, T_MD5, strlen(T_MD5))) { if (dbcheck_md5(buf, s + strlen(T_MD5) + 1)) { continue; } diff -cr harvest-1.5.20/src/gatherer/essence/summarize.c harvest-1.5.20-rp/src/gatherer/essence/summarize.c *** harvest-1.5.20/src/gatherer/essence/summarize.c Fri Jun 12 16:35:58 1998 --- harvest-1.5.20-rp/src/gatherer/essence/summarize.c Wed Jun 23 09:45:25 1999 *************** *** 943,945 **** --- 943,986 ---- } return; } + + /* + * summarize_empty_object() - by rp@win.tue.nl, June, 1999 + * part of a kludge to support the addition of SOIF objects for URLs + * that are broken of represent redirections + * + * the point is to generate SOIF output without having to create an 'object' + * so we copy the relevant parts of process_url(), summarize(), + * generate_oid(), etcetera + */ + void summarize_empty_object(url, type, value) + char *url; + char *type; + char *value; + { + Template *template = NULL; + struct OID *oid = NULL; + int pp_code = 0; + + Debug(62, 1, ("summarize_empty_object(%s,%s,%s)\n", + url, type, value)); + + /* what follows is simplified from summarize.c:summarize_file() */ + oid = new_oid(url, + gatherer_id, time(NULL), default_ttl, 
default_refresh); + template = create_template_with_oid(NULL, url, oid); + /* Add some other known Attributes */ + add_AVList(template->list, type, value, strlen(value)); + /* no quicksum - fix this later? - rp */ + /* no external summary - fix this - rp */ + mkgid(template); + pp_code = post_process(template); + if (pp_code == SUMMARIZE_DONT_ADD_OBJECT) { + Debug(64, 1, ("NOT adding %s to the database\n", url)); + } else { + add_template_nonnested(template); + } + /* clean up */ + free_template(template); + free_oid(oid); + } diff -cr harvest-1.5.20/src/gatherer/include/db.h harvest-1.5.20-rp/src/gatherer/include/db.h *** harvest-1.5.20/src/gatherer/include/db.h Fri Jun 12 16:36:27 1998 --- harvest-1.5.20-rp/src/gatherer/include/db.h Tue Jun 22 22:25:08 1999 *************** *** 90,95 **** --- 90,96 ---- void init_db _PARAMS((char *, int)); void add_template _PARAMS((Template *, DataObject *)); + void add_template_nonnested _PARAMS((Template *)); int duplicate_url _PARAMS((char *)); int duplicate_url_any _PARAMS((char *)); void finish_db _PARAMS(()); diff -cr harvest-1.5.20/src/gatherer/include/summarize.h harvest-1.5.20-rp/src/gatherer/include/summarize.h *** harvest-1.5.20/src/gatherer/include/summarize.h Fri Jun 12 16:36:28 1998 --- harvest-1.5.20-rp/src/gatherer/include/summarize.h Tue Jun 22 22:29:12 1999 *************** *** 92,97 **** --- 92,98 ---- void init_summarize _PARAMS(()); int summarize _PARAMS((DataObject *)); int summarize_nested_object _PARAMS((DataObject *)); + void summarize_empty_object _PARAMS((char *url,char *type,char *value)); void finish_summarize _PARAMS(()); #endif /* _SUMMARIZE_H_ */