head	1.2;
access;
symbols
	RELEASE_7_4_0:1.1
	RELEASE_8_2_0:1.1
	RELEASE_6_EOL:1.1
	RELEASE_8_1_0:1.1
	RELEASE_7_3_0:1.1
	RELEASE_8_0_0:1.1;
locks; strict;
comment	@# @;


1.2
date	2011.09.07.09.30.14;	author bapt;	state dead;
branches;
next	1.1;

1.1
date	2009.07.08.02.35.36;	author pgollucci;	state Exp;
branches;
next	;


desc
@@


1.2
log
@2011-09-01 x11-wm/epiwm: Abandonware
2011-09-01 x11-toolkits/p5-GdkPixbuf: Disappear from CPAN
2011-09-01 x11-toolkits/XawPlus: No more public distfiles
2011-09-01 x11-fonts/gnome-font-sampler: No more public distfiles
2011-09-01 x11/xvattr: No more public distfiles
2011-09-01 www/xitami: No more public distfiles
2011-09-01 www/webredirect: No more public distfiles
2011-09-01 www/webglimpse: No more public distfiles
2011-09-01 www/squishdot: No more public distfiles
2011-09-01 www/py-forgethtml: No more public distfiles
2011-09-01 www/mmosaic: No more public distfiles
2011-09-01 www/ljsm: No more public distfiles
2011-09-01 www/ljpms: No more public distfiles
2011-09-01 www/ashe: No more public distfiles
2011-09-01 textproc/europass-xsl: BROKEN for more than 6 month
2011-09-01 textproc/carthage: No more public distfiles
@
text
@--- ./makenh.orig	1998-07-27 19:21:30.000000000 -0400
+++ ./makenh	2009-07-07 22:30:26.517066791 -0400
@@@@ -68,6 +68,7 @@@@
 $SITE_RE = '[^:]+:\/\/([^\/]+)\/.*';
 $NumLocalCollected = 0;
 $NumRemoteCollected = 0;
+$max_redir = 6;
 # LOGFILE, ERRFILE -- files for logging
 
 ### *TO CHANGE TRAVERSAL*
@@@@ -105,6 +106,7 @@@@
 $LOGFILENAME = ".wg_log";
 # $STARTFILE = ".wgstart";
 $WGADDSEARCH = ".wgfilter-box";
+$SITECACHE = ".wgsitecache";
 
 $ROBOTNAME = "HTTPGET";
 
@@@@ -187,22 +189,22 @@@@
 
 # Initialize variables to avoid warnings
    ($title, $urlpath, $traverse_type, $explicit_only, $numhops,
-    $nhhops, $local_limit, $remote_limit, $addboxes, $vhost, $usemaxmem) = 
-   ('','','','','','','','','','','');
+    $nhhops, $local_limit, $remote_limit, $addboxes, $vhost, $usemaxmem, $locale, $charset) =
+   ('','','','','','','','','','','','','');
 
 ($title, $urlpath, $traverse_type, $explicit_only, $numhops,
- $nhhops, $local_limit, $remote_limit, $addboxes, $vhost, $usemaxmem, @@urllist) = ReadConfig($archivepwd);
+ $nhhops, $local_limit, $remote_limit, $addboxes, $vhost, $usemaxmem, $locale, $charset, @@urllist) = ReadConfig($archivepwd);
 
 # open logs
 &open_logs();
 
 print LOGFILE "From Configuration:\n";
 my(@@configlist) = qw(title urlpath traverse_type explicit_only numhops
-	nhhops local_limit remote_limit addboxes vhost usemaxmem ) ;
+	nhhops local_limit remote_limit addboxes vhost usemaxmem locale charset) ;
 foreach $item (@@configlist) 
 {
 	$value = '';
-	eval "$value = \$$item";
+	eval "\$value = \$$item";
 	print LOGFILE " $item: $value\n";
 }
 print LOGFILE " urllist: @@urllist\n\n";
@@@@ -231,6 +233,7 @@@@
 $MAPFILE = "$archivepwd/$MAPFILE";
 $TEMPROBOTFILE = "$archivepwd/$TEMPROBOTFILE";
 $WGADDSEARCH = "$archivepwd/$WGADDSEARCH";
+$SITECACHE = "$archivepwd/$SITECACHE";
 
 ($archiveprot, $archivehost, $archiveport, $archivepath) = 
    &url::parse_url($archiveurl);
@@@@ -252,7 +255,7 @@@@
 
 # read in the site configuration
 &siteconf::ReadConf($vhost);
-&siteconf::LoadCache();
+&siteconf::LoadCache("$SITECACHE");
 
 ###############
 ### PHASE 1 ###
@@@@ -398,7 +401,7 @@@@
 &close_logs();
 
 # remove the robots file
-system("rm -rf $TEMPROBOTFILE");
+unlink($TEMPROBOTFILE);
 
 #----------------------
 #change the dir back
@@@@ -751,7 +754,7 @@@@
    my($prot, $host, $port, $path) = &url::parse_url($url);
    
    # if the protocol isn't http, assume it's good
-   if($prot!~/http/i){
+   if(!defined($prot) || $prot!~/http/i){
       return 1;
    }
    
@@@@ -800,6 +803,7 @@@@
    my($output);
    my($olddata, $newdata);
    my($newprot, $newhost, $newport, $newpath, $url);
+   my($redcount)=0;
    
    # make the $url
    $url = "http://$host:$port/robots.txt";
@@@@ -815,6 +819,7 @@@@
    while($output ne ""){
       # more for error?
       if($output=~/^error/i){
+	 truncate($TEMPROBOTFILE,0);
 	 print ERRFILE "Error with getting $url\n";
 	 #			print LOGFILE "Error with getting $url\n";
 	 last;
@@@@ -822,7 +827,13 @@@@
       
       # look at output for redirect -- store redirects in file, too
       if($output=~/^Redirect: (.*)$/){
-	 print LOGFILE "Redirected to: $1...";
+	 if ($redcount >= $max_redir) {
+	     truncate($TEMPROBOTFILE,0);
+	     print ERRFILE "Too many redirections with $url\n";
+	     last;
+	 }
+	 $redcount++;
+	 print LOGFILE "Redirected to: $1...\n";
 	 
 	 # see if we have the redirected server
 	 ($newprot, $newhost, $newport, $newpath) = &url::parse_url($1);
@@@@ -843,6 +854,7 @@@@
 	 }
       }else{
 	 # we've got it, or there's an error...
+	 truncate($TEMPROBOTFILE,0);
 	 last;
       }
    }
@@@@ -894,6 +906,7 @@@@
 sub geturl2file{
    my($url) = @@_;
    my($output, $link, $file, $oldfile, @@aliases);
+   my($redcount)=0;
    
    # check if we have that in stock (we know it's not local)
    if (defined($URL2FILE{$url})) {
@@@@ -930,6 +943,7 @@@@
       while($output ne ""){
 	 # more for error?
 	 if($output=~/^error/i){
+	    truncate($file,0);
 	    print ERRFILE "Error with getting $url: $output\n";
 	    #				print LOGFILE "Error with getting $url\n";
 	    last;
@@@@ -937,6 +951,12 @@@@
 	 
 	 # look at output for redirect -- store redirects in file, too
 	 if($output=~/^Redirect: (.*)$/){
+	    if ($redcount >= $max_redir) {
+		truncate($file,0);
+		print ERRFILE "Too many redirections with $url\n";
+		last;
+	    }
+	    $redcount++;
 	    &ungetnewname();	# rewind the name counter		
 				# The next get will overwrite the unnecessary file
 	    
@@@@ -970,6 +990,7 @@@@
 	    }
 	 }else{
 	    # we've got it, or there's an error...
+	    truncate($file,0);
 	    last;
 	 }
       }
@@@@ -1159,6 +1180,15 @@@@
       ($prot, $host, $port, $path) = &url::parse_url($url);
       #print "URL after parsing: $prot://$host:$port$path\n";
       
+      next if !defined($prot);
+      if (!defined($port) ||
+	  ($port eq '80' && $prot =~ /^https?$/) ||
+	  ($port eq '21' && $prot eq 'ftp')) {
+	$port = '';
+      } else {
+	$port = ":$port";
+      }
+
       # make sure the path has a preceding /
       $path = "/$path" if $path!~/^\//;
       
@@@@ -1177,7 +1207,7 @@@@
 #      $host = "$a.$b.$c.$d";
 #      }
       
-      $url = "$prot://$host:$port$path";
+      $url = "$prot://$host$port$path";
       #print "URL after normalization: $url\n";
       
       # strip off any #text
@


1.1
log
@- patch file shuffle for my sanity
@
text
@@

