head	1.5;
access;
symbols
	RELEASE_7_2_0:1.4
	RELEASE_7_1_0:1.4
	RELEASE_6_4_0:1.4
	RELEASE_5_EOL:1.4
	RELEASE_7_0_0:1.4
	RELEASE_6_3_0:1.4
	PRE_XORG_7:1.4
	RELEASE_4_EOL:1.4
	RELEASE_6_2_0:1.4
	RELEASE_6_1_0:1.4
	RELEASE_5_5_0:1.4
	RELEASE_6_0_0:1.4
	RELEASE_5_4_0:1.4
	RELEASE_4_11_0:1.4
	RELEASE_5_3_0:1.4
	RELEASE_4_10_0:1.4
	RELEASE_5_2_1:1.4
	RELEASE_5_2_0:1.4
	RELEASE_4_9_0:1.4
	RELEASE_5_1_0:1.4
	RELEASE_4_8_0:1.4
	RELEASE_5_0_0:1.4
	RELEASE_4_7_0:1.4
	RELEASE_4_6_2:1.4
	RELEASE_4_6_1:1.4
	RELEASE_4_6_0:1.4
	RELEASE_5_0_DP1:1.4
	RELEASE_4_5_0:1.4
	RELEASE_4_4_0:1.4
	RELEASE_4_3_0:1.4
	RELEASE_4_2_0:1.4
	RELEASE_4_1_1:1.4
	RELEASE_4_1_0:1.4
	RELEASE_3_5_0:1.4
	RELEASE_4_0_0:1.4
	RELEASE_3_4_0:1.4
	RELEASE_3_3_0:1.4
	RELEASE_3_2_0:1.4
	RELEASE_3_1_0:1.4
	RELEASE_2_2_8:1.4;
locks; strict;
comment	@# @;


1.5
date	2009.07.08.02.35.36;	author pgollucci;	state dead;
branches;
next	1.4;

1.4
date	98.11.04.04.17.02;	author ache;	state Exp;
branches;
next	1.3;

1.3
date	98.11.02.20.04.29;	author ache;	state Exp;
branches;
next	1.2;

1.2
date	98.11.02.08.54.49;	author ache;	state Exp;
branches;
next	1.1;

1.1
date	98.11.02.08.52.05;	author ache;	state Exp;
branches;
next	;


desc
@@


1.5
log
@- patch file shuffle for my sanity
@
text
@--- makenh.orig	Tue Jul 28 03:21:30 1998
+++ makenh	Wed Nov  4 07:05:47 1998
@@@@ -68,6 +68,7 @@@@
 $SITE_RE = '[^:]+:\/\/([^\/]+)\/.*';
 $NumLocalCollected = 0;
 $NumRemoteCollected = 0;
+$max_redir = 6;
 # LOGFILE, ERRFILE -- files for logging
 
 ### *TO CHANGE TRAVERSAL*
@@@@ -105,6 +106,7 @@@@
 $LOGFILENAME = ".wg_log";
 # $STARTFILE = ".wgstart";
 $WGADDSEARCH = ".wgfilter-box";
+$SITECACHE = ".wgsitecache";
 
 $ROBOTNAME = "HTTPGET";
 
@@@@ -187,22 +189,22 @@@@
 
 # Initialize variables to avoid warnings
    ($title, $urlpath, $traverse_type, $explicit_only, $numhops,
-    $nhhops, $local_limit, $remote_limit, $addboxes, $vhost, $usemaxmem) = 
-   ('','','','','','','','','','','');
+    $nhhops, $local_limit, $remote_limit, $addboxes, $vhost, $usemaxmem, $locale, $charset) =
+   ('','','','','','','','','','','','','');
 
 ($title, $urlpath, $traverse_type, $explicit_only, $numhops,
- $nhhops, $local_limit, $remote_limit, $addboxes, $vhost, $usemaxmem, @@urllist) = ReadConfig($archivepwd);
+ $nhhops, $local_limit, $remote_limit, $addboxes, $vhost, $usemaxmem, $locale, $charset, @@urllist) = ReadConfig($archivepwd);
 
 # open logs
 &open_logs();
 
 print LOGFILE "From Configuration:\n";
 my(@@configlist) = qw(title urlpath traverse_type explicit_only numhops
-	nhhops local_limit remote_limit addboxes vhost usemaxmem ) ;
+	nhhops local_limit remote_limit addboxes vhost usemaxmem locale charset) ;
 foreach $item (@@configlist) 
 {
 	$value = '';
-	eval "$value = \$$item";
+	eval "\$value = \$$item";
 	print LOGFILE " $item: $value\n";
 }
 print LOGFILE " urllist: @@urllist\n\n";
@@@@ -231,6 +233,7 @@@@
 $MAPFILE = "$archivepwd/$MAPFILE";
 $TEMPROBOTFILE = "$archivepwd/$TEMPROBOTFILE";
 $WGADDSEARCH = "$archivepwd/$WGADDSEARCH";
+$SITECACHE = "$archivepwd/$SITECACHE";
 
 ($archiveprot, $archivehost, $archiveport, $archivepath) = 
    &url::parse_url($archiveurl);
@@@@ -252,7 +255,7 @@@@
 
 # read in the site configuration
 &siteconf::ReadConf($vhost);
-&siteconf::LoadCache();
+&siteconf::LoadCache("$SITECACHE");
 
 ###############
 ### PHASE 1 ###
@@@@ -398,7 +401,7 @@@@
 &close_logs();
 
 # remove the robots file
-system("rm -rf $TEMPROBOTFILE");
+unlink($TEMPROBOTFILE);
 
 #----------------------
 #change the dir back
@@@@ -751,7 +754,7 @@@@
    my($prot, $host, $port, $path) = &url::parse_url($url);
    
    # if the protocol isn't http, assume it's good
-   if($prot!~/http/i){
+   if(!defined($prot) || $prot!~/http/i){
       return 1;
    }
    
@@@@ -800,6 +803,7 @@@@
    my($output);
    my($olddata, $newdata);
    my($newprot, $newhost, $newport, $newpath, $url);
+   my($redcount)=0;
    
    # make the $url
    $url = "http://$host:$port/robots.txt";
@@@@ -815,6 +819,7 @@@@
    while($output ne ""){
       # more for error?
       if($output=~/^error/i){
+	 truncate($TEMPROBOTFILE,0);
 	 print ERRFILE "Error with getting $url\n";
 	 #			print LOGFILE "Error with getting $url\n";
 	 last;
@@@@ -822,7 +827,13 @@@@
       
       # look at output for redirect -- store redirects in file, too
       if($output=~/^Redirect: (.*)$/){
-	 print LOGFILE "Redirected to: $1...";
+	 if ($redcount >= $max_redir) {
+	     truncate($TEMPROBOTFILE,0);
+	     print ERRFILE "Too many redirections with $url\n";
+	     last;
+	 }
+	 $redcount++;
+	 print LOGFILE "Redirected to: $1...\n";
 	 
 	 # see if we have the redirected server
 	 ($newprot, $newhost, $newport, $newpath) = &url::parse_url($1);
@@@@ -843,6 +854,7 @@@@
 	 }
       }else{
 	 # we've got it, or there's an error...
+	 truncate($TEMPROBOTFILE,0);
 	 last;
       }
    }
@@@@ -894,6 +906,7 @@@@
 sub geturl2file{
    my($url) = @@_;
    my($output, $link, $file, $oldfile, @@aliases);
+   my($redcount)=0;
    
    # check if we have that in stock (we know it's not local)
    if (defined($URL2FILE{$url})) {
@@@@ -930,6 +943,7 @@@@
       while($output ne ""){
 	 # more for error?
 	 if($output=~/^error/i){
+	    truncate($file,0);
 	    print ERRFILE "Error with getting $url: $output\n";
 	    #				print LOGFILE "Error with getting $url\n";
 	    last;
@@@@ -937,6 +951,12 @@@@
 	 
 	 # look at output for redirect -- store redirects in file, too
 	 if($output=~/^Redirect: (.*)$/){
+	    if ($redcount >= $max_redir) {
+		truncate($file,0);
+		print ERRFILE "Too many redirections with $url\n";
+		last;
+	    }
+	    $redcount++;
 	    &ungetnewname();	# rewind the name counter		
 				# The next get will overwrite the unnecessary file
 	    
@@@@ -970,6 +990,7 @@@@
 	    }
 	 }else{
 	    # we've got it, or there's an error...
+	    truncate($file,0);
 	    last;
 	 }
       }
@@@@ -1159,6 +1180,15 @@@@
       ($prot, $host, $port, $path) = &url::parse_url($url);
       #print "URL after parsing: $prot://$host:$port$path\n";
       
+      next if !defined($prot);
+      if (!defined($port) ||
+	  ($port eq '80' && $prot =~ /^https?$/) ||
+	  ($port eq '21' && $prot eq 'ftp')) {
+	$port = '';
+      } else {
+	$port = ":$port";
+      }
+
       # make sure the path has a preceding /
       $path = "/$path" if $path!~/^\//;
       
@@@@ -1177,7 +1207,7 @@@@
 #      $host = "$a.$b.$c.$d";
 #      }
       
-      $url = "$prot://$host:$port$path";
+      $url = "$prot://$host$port$path";
       #print "URL after normalization: $url\n";
       
       # strip off any #text
@


1.4
log
@autosense localization from env vars
always put site cache locally
@
text
@@


1.3
log
@Localize it
@
text
@d2 1
a2 1
+++ makenh	Mon Nov  2 19:55:02 1998
d11 9
a19 1
@@@@ -187,22 +188,22 @@@@
d47 18
a64 1
@@@@ -398,7 +399,7 @@@@
d73 1
a73 1
@@@@ -751,7 +752,7 @@@@
d82 1
a82 1
@@@@ -800,6 +801,7 @@@@
d90 1
a90 1
@@@@ -815,6 +817,7 @@@@
d98 1
a98 1
@@@@ -822,7 +825,13 @@@@
d113 1
a113 1
@@@@ -843,6 +852,7 @@@@
d121 1
a121 1
@@@@ -894,6 +904,7 @@@@
d129 1
a129 1
@@@@ -930,6 +941,7 @@@@
d137 1
a137 1
@@@@ -937,6 +949,12 @@@@
d150 1
a150 1
@@@@ -970,6 +988,7 @@@@
d158 1
a158 1
@@@@ -1159,6 +1178,15 @@@@
d174 1
a174 1
@@@@ -1177,7 +1205,7 @@@@
@


1.2
log
@typo
@
text
@d2 1
a2 1
+++ makenh	Mon Nov  2 11:46:14 1998
d11 28
d133 25
@


1.1
log
@prevent redirection loop and indexing error pages
@
text
@d25 1
a25 1
+   if(defined($prot) && $prot!~/http/i){
@

