#!/usr/bin/perl -wT
#
# Extract RDF info from images
# by plh@w3.org
# (c) 2005 World Wide Web Consortium
#
# Largely based on code from the W3C link checker:
#  http://validator.w3.org/checklink
#
# $Id: img2rdf.pl,v 1.6 2008/03/27 12:56:28 plehegar Exp $
#
# This program is licensed under the W3C(r) Software License:
#       http://www.w3.org/Consortium/Legal/copyright-software
#
# The documentation is as follows:
#  imgGrrdl <uri>
#
# An online version is not yet available
#
# Comments and suggestions could be sent to plh@w3.org

use strict;

# Get rid of potentially unsafe and unneeded environment variables.
delete(@ENV{qw(IFS CDPATH ENV BASH_ENV)});
$ENV{PATH} = ''; # undef would output warnings with Perl 5.6.1's Cwd.pm.

# -----------------------------------------------------------------------------

package W3C::UserAgent;

use LWP::RobotUA 1.19 qw();

BEGIN {
 unshift @INC, "/usr/local/Image-ExifTool/lib";
}

use Image::ExifTool qw{:Public :Vars};

@W3C::UserAgent::ISA = qw(LWP::RobotUA);

sub new
{
  my $proto = shift;
  my $class = ref($proto) || $proto;
  my ($name, $from, $rules) = @_;

  # For security/privacy reasons, if $from was not given, do not send it.
  # Cheat by defining something for the constructor, and resetting it later.
  my $from_ok = $from;
  $from ||= 'nowhere@w3.org';
  # WWW::RobotRules <= 5.78 have bugs which cause suboptimal results with
  # User-Agent substring matching against robots.txt files; "User-Agent: *"
  # should work ok with all though, and "User-Agent: W3C-checklink" for >= 5.77
  my $self = $class->SUPER::new($name, $from, $rules);
  $self->from(undef) unless $from_ok;

  $self->env_proxy();
  return $self;
}

sub simple_request
{
  my $self = shift;
  my $response = $self->W3C::UserAgent::SUPER::simple_request(@_);
  if (! defined($self->{FirstResponse})) {
    $self->{FirstResponse} = $response->code();
    $self->{FirstMessage} = $response->message() || '(no message)';
  }
  return $response;
}

sub redirect_ok
{
  my ($self, $request, $response) = @_;
  if ($self->{Checklink_verbose_progress}) {
    # @@@ TODO: when an LWP internal robots.txt request gets redirected,
    # this will a bit confusingly print out info about it.  Would need a
    # robust way of determining whether something is a LWP "internal" request.
    &W3C::IMG2RDF::hprintf("\n%s %s ", $request->method(),$request->uri());
  }
  return 0 unless $self->SUPER::redirect_ok($request, $response);
  if (my $res = &W3C::IMG2RDF::ip_allowed($request->uri())) {
    $response->previous($response->clone());
    $response->request($request);
    $response->code($res->code());
    $response->message($res->message());
    return 0;
  }
  return 1;
}

# -----------------------------------------------------------------------------

package W3C::IMG2RDF;

use vars qw($AGENT $PACKAGE $PROGRAM $VERSION $REVISION
            $DocType $Head $Accept $ContentTypes %Cfg);

use HTML::Entities       qw();
use HTTP::Request        qw();
use HTTP::Response       qw();
use Time::HiRes          qw();
use URI                  qw();
use URI::Escape          qw();
use URI::file            qw();
# @@@ Needs also W3C::UserAgent but can't use() it here.

use constant RC_ROBOTS_TXT => -1;
use constant RC_DNS_ERROR  => -2;

BEGIN
{
  # Version info
  $PACKAGE     = 'Image to RDF';
  $PROGRAM     = 'imgGrrdl';
  $VERSION     = '0.0.1';
  $REVISION    = sprintf('version %s (c) 2005 W3C', $VERSION);
  my ($cvsver) = q$Revision: 1.6 $ =~ /(\d+[\d\.]*\.\d+)/;
  $AGENT       = sprintf('%s/%s [%s] %s',
                         $PROGRAM, $VERSION, $cvsver, LWP::RobotUA->_agent());

  # Pull in mod_perl modules if applicable.
  if ($ENV{MOD_PERL}) {
    eval "require Apache2::compat"; # For mod_perl 2
    require Apache;
  }

  my @content_types = qw(video/avi image/bmp application/postscript application/dicom image/gif image/jng image/jpeg2000 image/jpeg application/x-magick-image video/mng video/quicktime audio/mpeg video/mp4 image/x-portable-bitmap application/pdf image/x-portable-graymap image/pict image/png image/x-portable-pixmap application/postscript application/photoshop image/x-quicktime image/tiff audio/x-wav application/xmp);

  $Accept = join(', ', @content_types) . ', */*;q=0.5';
  my $re = join('|', map { s/\+/\\+/g; $_ } @content_types);
  $ContentTypes = qr{\b(?:$re)\b}io;

  #
  # Read configuration.  If the W3C_CHECKLINK_CFG environment variable has
  # been set or the default contains a non-empty file, read it.  Otherwise,
  # skip silently.
  #
  my $defaultconfig = '/etc/w3c/checklink.conf';
  if ($ENV{W3C_CHECKLINK_CFG} || -s $defaultconfig) {

    require Config::General;
    Config::General->require_version(2.06); # Need 2.06 for -SplitPolicy

    my $conffile = $ENV{W3C_CHECKLINK_CFG} || $defaultconfig;
    eval {
      my %config_opts =
        ( -ConfigFile        => $conffile,
          -SplitPolicy       => 'equalsign',
          -AllowMultiOptions => 'no',
        );
      %Cfg = Config::General->new(%config_opts)->getall();
    };
    if ($@) {
      die <<".EOF.";
Failed to read configuration from '$conffile':
$@
.EOF.
    }
  }

  # Trusted environment variables that need laundering in taint mode.
  foreach (qw(NNTPSERVER NEWSHOST)) {
    ($ENV{$_}) = ($ENV{$_} =~ /^(.*)$/) if $ENV{$_};
  }

  # Use passive FTP by default, see Net::FTP(3).
  $ENV{FTP_PASSIVE} = 1 unless exists($ENV{FTP_PASSIVE});
}

# Autoflush
$| = 1;

# Different options specified by the user
my $cmdline = ! ($ENV{GATEWAY_INTERFACE} && $ENV{GATEWAY_INTERFACE} =~ /^CGI/);
my %Opts =
  ( Command_Line      => $cmdline,
    Quiet             => 0,
    Verbose           => 0,
    Timeout           => 60,
    Redirects         => 1,
    Dir_Redirects     => 1,
    HTTP_Proxy        => undef,
    Sleep_Time        => 1,
    User              => undef,
    Password          => undef,
    Trusted           => $Cfg{Trusted},
    Allow_Private_IPs => defined($Cfg{Allow_Private_IPs}) ?
                         $Cfg{Allow_Private_IPs} : $cmdline,
  );
undef $cmdline;

unless ($Opts{Allow_Private_IPs}) {
  eval {
    require Net::IP;
    require Socket;
    Socket->import('inet_ntoa');
    require Net::hostent;
  };
  if ($@) {
    die <<".EOF.";
Allow_Private_IPs is false; this feature requires the Net::IP, Socket, and
Net::hostent modules:
$@
.EOF.
  }
}

# Global variables
# What is our query?
my $query;
# List of redirects
my %redirects;
# Count of the number of documents checked
my $doc_count = 0;
# Time stamp
my $timestamp = &get_timestamp();

&parse_arguments() if $Opts{Command_Line};

# Precompile/error-check regular expressions.
if (defined($Opts{Exclude_Docs})) {
  eval { $Opts{Exclude_Docs} = qr/$Opts{Exclude_Docs}/o; };
  &usage(1, "Error in exclude-docs regexp: $@") if $@;
}
if (defined($Opts{Trusted})) {
  eval { $Opts{Trusted} = qr/$Opts{Trusted}/io; };
  &usage(1, "Error in trusted domains regexp: $@") if $@;
}

my $ua = W3C::UserAgent->new($AGENT); # @@@ TODO: admin address
# @@@ make number of keep-alive connections customizable
$ua->conn_cache({ total_capacity => 1}); # 1 keep-alive connection
$ua->delay($Opts{Sleep_Time}/60);
$ua->timeout($Opts{Timeout});
$ua->proxy('http', 'http://' . $Opts{HTTP_Proxy}) if $Opts{HTTP_Proxy};

if ($Opts{Command_Line}) {

  require Text::Wrap;
  Text::Wrap->import('wrap');

  &usage(1) unless @ARGV;

  &ask_password() if ($Opts{User} && !$Opts{Password});

  my $uri = $ARGV[0];
  if (!$Opts{Summary_Only}) {
#    printf("%s %s\n", $PACKAGE, $REVISION);
  } else {
    $Opts{Verbose} = 0;
    $Opts{Progress} = 0;
  }
  # Transform the parameter into a URI
  $uri = &urize($uri); 
  &image2RDF($uri, 0);
} else {

  require CGI;
  require CGI::Carp;
  CGI::Carp->import(qw(fatalsToBrowser));
  $query = new CGI;
  # Set a few parameters in CGI mode
  $Opts{Verbose}   = 0;
  $Opts{Progress}  = 0;
  $Opts{_Self_URI} = $query->url(-relative => 1);

  # Backwards compatibility
  my $uri = $query->param('uri');

  if (! $uri) {
    &rdf_header();
    &empty_RDF($query);
    exit;
  }

  $Opts{Accept_Language} = undef if $query->param('no_accept_language');

  undef $query; # Not needed any more.

  # All Apache configurations don't set HTTP_AUTHORIZATION for CGI scripts.
  # If we're under mod_perl, there is a way around it...
  if ($ENV{MOD_PERL}) {
    my $auth = Apache->request()->header_in('Authorization');
    $ENV{HTTP_AUTHORIZATION} ||= $auth if $auth;
  }

  $uri =~ s/^\s+//g;
  if ($uri =~ m/^file:/) {
    # Only the http scheme is allowed
    &file_uri($uri);
  } elsif ($uri !~ m/:/) {
    if ($uri =~ m|^//|) {
      $uri = 'http:'.$uri;
    } else {
      $uri = 'http://'.$uri;
    }
  }

  &image2RDF($uri, 1);
}

###############################################################################

################################
# Command line and usage stuff #
################################

sub parse_arguments ()
{

  require Getopt::Long;
  Getopt::Long->require_version(2.17);
  Getopt::Long->import('GetOptions');
  Getopt::Long::Configure('bundling', 'no_ignore_case');
  my $masq = '';

  GetOptions('help|h|?'        => sub { usage(0) },
             'q|quiet'         => sub { $Opts{Quiet} = 1;
                                        $Opts{Summary_Only} = 1;
                                      },
             'b|broken'        => sub { $Opts{Redirects} = 0;
                                        $Opts{Dir_Redirects} = 0;
                                      },
             'e|dir-redirects' => sub { $Opts{Dir_Redirects} = 0; },
             'v|verbose'       => \$Opts{Verbose},
             'l|location=s'    => \$Opts{Base_Location},
             'u|user=s'        => \$Opts{User},
             'p|password=s'    => \$Opts{Password},
             't|timeout=i'     => \$Opts{Timeout},
             'S|sleep=i'       => \$Opts{Sleep_Time},
             'd|domain=s'      => \$Opts{Trusted},
             'V|version'       => \&version
            )
    || usage(1);

  if ($Opts{Accept_Language} && $Opts{Accept_Language} eq 'auto') {
    $Opts{Accept_Language} = &guess_language();
  }

  if (($Opts{Sleep_Time} || 0) < 1) {
    warn("*** Warning: minimum allowed sleep time is 1 second, resetting.\n");
    $Opts{Sleep_Time} = 1;
  }
}

sub version ()
{
  print "$PACKAGE $REVISION\n";
  exit 0;
}

sub usage ()
{
  my ($exitval, $msg) = @_;
  $exitval = 0 unless defined($exitval);
  $msg ||= ''; $msg =~ s/[\r\n]*$/\n\n/ if $msg;

  die($msg) unless $Opts{Command_Line};

  my $trust = defined($Cfg{Trusted}) ? $Cfg{Trusted} : 'same host only';

  select(STDERR) if $exitval;
  print "$msg$PACKAGE $REVISION

Usage: imgGrrdl <options> <uris>
Options:
 -q, --quiet                No output if no errors are found (implies -s).
 -v, --verbose              Verbose mode.
 -u, --user USERNAME        Specify a username for authentication.
 -p, --password PASSWORD    Specify a password.
 -t, --timeout SECS         Timeout for requests (in seconds).
 -d, --domain DOMAIN        Regular expression describing the domain to which
                            authentication information will be sent
                            (default: $trust).
 -?, -h, --help             Show this message and exit.
 -V, --version              Output version information and exit.

See \"perldoc LWP\" for information about proxy server support,
\"perldoc Net::FTP\" for information about various environment variables
affecting FTP connections and \"perldoc Net::NNTP\" for setting a default
NNTP server for news: URIs.

The W3C_CHECKLINK_CFG environment variable can be used to set the
configuration file to use.  See details in the full manual page, it can
be displayed with: perldoc checklink

Please send bug reports and comments to:
  plh\@w3.org
";
  exit $exitval;
}

sub ask_password ()
{
  eval {
    local $SIG{__DIE__};
    require Term::ReadKey;
    Term::ReadKey->require_version(2.00);
    Term::ReadKey->import(qw(ReadMode));
  };
  if ($@) {
    warn('Warning: Term::ReadKey 2.00 or newer not available, ' .
         "password input disabled.\n");
    return;
  }
  printf(STDERR 'Enter the password for user %s: ', $Opts{User});
  ReadMode('noecho',  *STDIN);
  chomp($Opts{Password} = <STDIN>);
  ReadMode('restore', *STDIN);
  print(STDERR "ok.\n");
}

###############################################################################

###########################################################################
# Guess an Accept-Language header based on the $LANG environment variable #
###########################################################################

sub guess_language ()
{
  my $lang = $ENV{LANG} or return undef;

  $lang =~ s/[\.@].*$//; # en_US.UTF-8, fi_FI@euro...

  return 'en' if ($lang eq 'C' || $lang eq 'POSIX');

  my $res = undef;
  eval {
    require Locale::Language;
    if (my $tmp = Locale::Language::language2code($lang)) {
      $lang = $tmp;
    }
    if (my ($l, $c) = (lc($lang) =~ /^([a-z]+)(?:[-_]([a-z]+))?/)) {
      if (Locale::Language::code2language($l)) {
        $res = $l;
        if ($c) {
          require Locale::Country;
          $res .= "-$c" if Locale::Country::code2country($c);
        }
      }
    }
  };
  return $res;
}

###########################################
# Transform foo into file://localhost/foo #
###########################################

sub urize ($)
{
  my $u = URI->new_abs(URI::Escape::uri_unescape($_[0]), URI::file->cwd());
  return $u->as_string();
}

########################################
# Check for broken links in a resource #
########################################

sub image2RDF ($$$;$)
{

  my ($uri, $header) = @_;

  my $start = &get_timestamp() unless $Opts{Quiet};

  # Get and parse the document
  my $response = &get_image('GET', $uri, \%redirects);

  # Can we check the resource? If not, we exit here...
  return -1 if defined($response->{Stop});

  my $absolute_uri = $response->{absolute_uri}->as_string();

  my $result_anchor = 'results'.$doc_count;

  my $esc_uri = URI::Escape::uri_escape($absolute_uri, "^A-Za-z0-9.");

  my $rdf = &generateRDF($uri, $absolute_uri,
                         $response->content());

  if ($header) {
    print "Content-Type: application/rdf+xml; charset=utf-8\n";
    print "Content-Length: ".length($rdf)."\n\n";
  }
  print $rdf;

  return 0;
}


sub generateRDF ($$$;\%)
{
  my ($uri, $location, $imageData) = @_;

  my $exiftool = new Image::ExifTool;

  my $outBuf = '';
  my $value = '';

  $exiftool->SetNewValuesFromFile(\$imageData);
  $exiftool->WriteInfo(undef, \$outBuf, 'XMP');

  $outBuf = clean_XMP($outBuf, $location);

  $value = $exiftool->GetValue("XMP", "Raw");
  if ($value) {
    # do something
    $outBuf .= clean_XMP($value, $location);
  }

  # ok, now we have a clean outBuf, let's see if we can add more info
  $exiftool->ExtractInfo(\$imageData);
  my @foundTags = $exiftool->GetFoundTags('File');  
         
  my @tagList = Image::ExifTool::GetWritableTags();
  my %is_writable;
  for (@tagList) { $is_writable{$_} = 1 }
 
  my $extra = "";

  foreach (@foundTags) {
    if ($is_writable{$_} || / \([0-9]\)/) {
      # skip, it's a duplicate anyway
    } elsif ($exiftool->GetValue($_)) {
      my $value = $exiftool->GetValue($_);
      if ($_ eq "MIMEType") {
        $_ = $outBuf;
        if (! /<dc:format>/) {
          $extra .= "  <dc:format>".$value."</dc:format>\n";
        }
      } elsif (ref $value) {
          # skip, it's something strange
      } else {
#        print $_.": ".$value."\n";
      }
    }     
  }

  $value = $exiftool->GetValue("Copyright");
  if ($value) {
    my $group = $exiftool->GetGroup("Copyright");
    if ($group eq "ICC_Profile") {
      # issue with exitTool 5.87. Remove the Copyright information
      $outBuf =~ s|[ ]*<tiff:Copyright>[ \n]+<rdf:Alt>[ \n]+<rdf:li xml:lang='x-default'>[^<]+</rdf:li>[ \n]+</rdf:Alt>[ \n]+</tiff:Copyright>\n*||;
      $value = $exiftool->GetValue("Copyright (1)");
    }
    $_ = $outBuf;
    if ($value and ! /<dc:rights/) {
      $extra .= "  <dc:rights xml:lang='x-default'>$value</dc:rights>\n";
    }
  }

  # Fix keywords
  my $keywords = $exiftool->GetValue("Keywords", "PrintConv");
  if ($keywords) {
    $outBuf =~ s|<pdf:Keywords>([^<]+)</pdf:Keywords>|<pdf:Keywords>$keywords</pdf:Keywords>|;
    $_ = $outBuf;
    if (! /<dc:subject/) {
      my @keywords = $exiftool->GetValue("Keywords", "ValueConv");
      my $buf = "  <dc:subject>\n   <rdf:Bag>\n";
      foreach (@keywords) { $buf .=  "    <rdf:li>$_</rdf:li>\n" }
      $buf .= "   </rdf:Bag>\n  </dc:subject>\n";
      $extra .= $buf;
    }
  }
  if (length($extra) > 0) {
    $outBuf .= " <rdf:Description rdf:about='".$location."'
         xmlns:dc='http://purl.org/dc/elements/1.1/'>
".$extra." </rdf:Description>\n";
  }

  my @fTags;
  push @fTags, "XMP";
  my $info = $exiftool->ImageInfo(\$imageData, \@fTags);

  return "<rdf:RDF xmlns:rdf='http://www.w3.org/1999/02/22-rdf-syntax-ns#'>
".$outBuf."</rdf:RDF>\n";
}

sub clean_XMP($$$;\%)
{
  my ($xmp, $location) = @_;

  # clean up
  $xmp =~ s/<\?xpacket[^>]+>//g;
  $xmp =~ s/<x:xmpmeta[^>]+>//g; 
  $xmp =~ s|</x:xmpmeta>||g; 
  $xmp =~ s|<exif:UserComment>[ \n]*<rdf:Alt>[ \n]*<rdf:li xml:lang='x-default'></rdf:li>[ \n]*</rdf:Alt>[ \n]*</exif:UserComment>||;
  $xmp =~ s|<aux:OwnerName></aux:OwnerName>||;
  
  $_ = $xmp;
  my $xmlns = '';
  while (/(xmlns:[^=]+='[^']+')/ || /(xmlns:[^=]+="[^"]+")/) {
    my $rest = $';
    $_ = $1;
    if (! /^xmlns:rdf=/ && ! /^xmlns:x=/) {
      $xmlns.="\n         ".$_;
    }
    $_ = $rest;
  }

  $xmp =~ s/<rdf:RDF[^>]+>//g; 
  $xmp =~ s|</rdf:RDF>||g;
  $xmp =~ s/<rdf:Description[^>]+>//g;
  $xmp =~ s|</rdf:Description>||g;
  $xmp =~ s/\s+\n/\n/g;
  $xmp =~ s/\n\n+/\n/g;

  $xmp =~ s|<Iptc4xmpCore:CiUrlWork>([^<]+)</Iptc4xmpCore:CiUrlWork>|<Iptc4xmpCore:CiUrlWork rdf:resource='$1'/>|;
  $xmp =~ s|<Iptc4xmpCore:CiEmailWork>([^<]+)</Iptc4xmpCore:CiEmailWork>|<Iptc4xmpCore:CiEmailWork rdf:resource='mailto:$1'/>|;
  $xmp =~ s|<xmpRights:WebStatement>([^<]+)</xmpRights:WebStatement>|<xmpRights:WebStatement rdf:resource='$1'/>|;
  $xmp =~ s|<cc:license>([^<]+)</cc:license>|<cc:license rdf:resource='$1'/>|;
  $xmp =~ s|>[^<]+<rdf:Alt>[^<]+<rdf:li xml:lang=["']([^"']+)["']>([^<]+)</rdf:li>[^<]+</rdf:Alt>[^<]+<| xml:lang="$1">$2<|mg;
  $xmp =~ s|>[^<]+<rdf:Seq>[^<]+<rdf:li>([^<]+)</rdf:li>[^<]+</rdf:Seq>[^<]+<|>$1<|mg;
  $xmp =~ s|>[^<]+<rdf:Bag>[^<]+<rdf:li>([^<]+)</rdf:li>[^<]+</rdf:Bag>[^<]+<|>$1<|mg;
  return " <rdf:Description rdf:about='".$location."'".$xmlns.">".$xmp." </rdf:Description>\n";
}

#######################################
# Get and parse a resource to process #
#######################################

sub get_image ($$$;\%)
{
  my ($method, $uri, $redirects) = @_;
  # $method contains the HTTP method the use (GET or HEAD)
  # $uri contains the identifier of the resource
  # $redirects is a pointer to the hash containing the map of the redirects

  # Get the resource
  my $response;
  $response = &get_uri($method, $uri);

  if (! $response->is_success()) {
    if ($response->code() == 401) {
      &authentication($response);
    } else {
      &hprintf("\nError: %d %s\n",
               $response->code(), $response->message() || '(no message)');
    }
    $response->{Stop} = 1;
    return($response);
  }

  # What is the URI of the resource that we are processing by the way?
  my $base_uri = URI->new($response->base());
  my $request_uri = URI->new($response->request->url);
  $response->{absolute_uri} = $request_uri->abs($base_uri);

  # Can we parse the document?
  my $failed_reason;
  my $ct = $response->header('Content-Type');
  my $ce = $response->header('Content-Encoding');
  if (!$ct || $ct !~ $ContentTypes) {
    $failed_reason = "Content-Type for <$request_uri> is " .
      (defined($ct) ? "'$ct'" : 'undefined');
  } elsif (defined($ce) && $ce ne 'identity') {
    # @@@ We could maybe handle gzip...
    $failed_reason = "Content-Encoding for <$request_uri> is '$ce'";
  }
  if ($failed_reason) {
    # No, there is a problem...
    &hprintf("Can't generate RDF from %s.\n", $failed_reason);
    $response->{Stop} = 1;
  }

  # Ok, return the information
  return($response);
}

############################
# Get the content of a URI #
############################

sub get_uri ($$;$\%$$$$)
{
  # Here we have a lot of extra parameters in order not to lose information
  # if the function is called several times (401's)
  my ($method, $uri, $start, $redirects, $code, $realm, $message, $auth) = @_;

  # $method contains the method used
  # $uri contains the target of the request
  # $start is a timestamp (not defined the first time the function is
  #                        called)
  # $redirects is a map of redirects
  # $code is the first HTTP return code
  # $realm is the realm of the request
  # $message is the HTTP message received
  # $auth equals 1 if we want to send out authentication information

  # For timing purposes
  $start = &get_timestamp() unless defined($start);

  # Prepare the query

  my $request = new HTTP::Request($method, $uri);
  $request->header('Accept-Language' => $Opts{Accept_Language})
    if $Opts{Accept_Language};
  $request->header('Accept', $Accept);
  # Are we providing authentication info?
  if ($auth && $request->url()->host() =~ $Opts{Trusted}) {
    if (defined($ENV{HTTP_AUTHORIZATION})) {
      $request->headers->header(Authorization => $ENV{HTTP_AUTHORIZATION});
    } elsif (defined($Opts{User}) && defined($Opts{Password})) {
      $request->authorization_basic($Opts{User}, $Opts{Password});
    }
  }

  # Check if the IP address is allowed.
  my $response = &ip_allowed($request->uri());
  return $response if $response;

  # Do the query
  $response = $ua->request($request);

  # Get the results
  # Record the very first response
  if (! defined($code)) {
    ($code, $message) = delete(@$ua{qw(FirstResponse FirstMessage)});
  }
  # Authentication requested?
  if ($response->code() == 401 &&
      !defined($auth) &&
      (defined($ENV{HTTP_AUTHORIZATION})
       || (defined($Opts{User}) && defined($Opts{Password})))) {

    # Set host as trusted domain unless we already have one.
    if (!$Opts{Trusted}) {
      my $re = sprintf('^%s$', quotemeta($response->base()->host()));
      $Opts{Trusted} = qr/$re/io;
    }

    # Deal with authentication and avoid loops
    if (! defined($realm)) {
      $response->headers->www_authenticate =~ /Basic realm=\"([^\"]+)\"/;
      $realm = $1;
    }
    return &get_uri($method, $response->request()->url(),
                    $start, $redirects, $code, $realm, $message, 1);
  }

  $response->{Realm} = $realm if defined($realm);

  return $response;
}

sub escape_match ($\%)
{
  my ($a, $hash) = (URI::Escape::uri_unescape($_[0]), $_[1]);
  foreach my $b (keys %$hash) {
    return 1 if ($a eq URI::Escape::uri_unescape($b));
  }
  return 0;
}

##########################
# Ask for authentication #
##########################

sub authentication ($)
{
  my $r = $_[0];
  $r->headers->www_authenticate =~ /Basic realm=\"([^\"]+)\"/;
  my $realm = $1;
  $realm = '' unless defined($realm);

  if ($Opts{Command_Line}) {
    printf STDERR <<EOF, $r->request()->url(), $realm;

Authentication is required for %s.
The realm is "%s".
Use the -u and -p options to specify a username and password and the -d option
to specify trusted domains.
EOF
  } else {

    printf("Status: 401 Authorization Required\nWWW-Authenticate: %s\nConnection: close\nContent-Type: application/rdf+xml; charset=utf-8\n\n", $r->headers->www_authenticate);

  print "<rdf:RDF xmlns:rdf='http://www.w3.org/1999/02/22-rdf-syntax-ns#'>
</rdf:RDF>";


    if ($Opts{Trusted}) {
      printf <<EOF, &encode($Opts{Trusted});
<!--  This service has been configured to send authentication only to hostnames
  matching the regular expression %s -->
EOF
    }

    print "\n";
  }
}

##################
# Get statistics #
##################

sub get_timestamp ()
{
  return pack('LL', Time::HiRes::gettimeofday());
}

sub time_diff ($$)
{
  my @start = unpack('LL', $_[0]);
  my @stop = unpack('LL', $_[1]);
  for ($start[1], $stop[1]) {
    $_ /= 1_000_000;
  }
  return(sprintf("%.1f", ($stop[0]+$stop[1])-($start[0]+$start[1])));
}

########################
# Handle the redirects #
########################

# Record the redirects in a hash
sub record_redirects (\%$)
{
  my ($redirects, $response) = @_;
  for (my $prev = $response->previous(); $prev; $prev = $prev->previous()) {
    $redirects->{$prev->request()->url()} = $response->request()->url();
  }
}

# Determine if a request is redirected
sub is_redirected ($%)
{
  my ($uri, %redirects) = @_;
  return(defined($redirects{$uri}));
}

# Get a list of redirects for a URI
sub get_redirects ($%)
{
  my ($uri, %redirects) = @_;
  my @history = ($uri);
  my %seen = ($uri => 1); # for tracking redirect loops
  my $loop = 0;
  while ($redirects{$uri}) {
    $uri = $redirects{$uri};
    push(@history, $uri);
    if ($seen{$uri}) {
      $loop = 1;
      last;
    } else {
      $seen{$uri}++;
    }
  }
  return ($loop, @history);
}

####################################################
# Tool for sorting the unique elements of an array #
####################################################

sub sort_unique (@)
{
  my %saw;
  @saw{@_} = ();
  return (sort { $a <=> $b } keys %saw);
}

#
# Checks whether we're allowed to retrieve the document based on it's IP
# address.  Takes an URI object and returns a HTTP::Response containing the
# appropriate status and error message if the IP was disallowed, undef
# otherwise.  URIs without hostname or IP address are always allowed,
# including schemes where those make no sense (eg. data:, often javascript:).
#
sub ip_allowed ($)
{
  my ($uri) = @_;
  return undef if $Opts{Allow_Private_IPs}; # Short-circuit

  my $hostname = undef;
  eval { $hostname = $uri->host() }; # Not all URIs implement host()...
  return undef unless $hostname;

  my $addr = my $iptype = my $resp = undef;
  if (my $host = Net::hostent::gethostbyname($hostname)) {
    $addr = inet_ntoa($host->addr()) if $host->addr();
    if ($addr && (my $ip = Net::IP->new($addr))) {
      $iptype = $ip->iptype();
    }
  }
  if ($iptype && $iptype ne 'PUBLIC') {
    $resp = HTTP::Response->new(403,
    'Checking non-public IP address disallowed by img2rdf configuration');
  }
  return $resp;
}

###############################################################################

################
# Global stats #
################

sub global_stats ()
{
  my $stop = &get_timestamp();
  my $n_docs =
    ($doc_count <= $Opts{Max_Documents}) ? $doc_count : $Opts{Max_Documents};
  return sprintf('Checked %d document%s in %s seconds.',
                 $n_docs,
                 ($n_docs == 1) ? '' : 's',
                 &time_diff($timestamp, $stop));
}

##################
# RDF interface #
##################

sub rdf_header ()
{
  # mod_perl 1.99_05 doesn't seem to like if the "\n\n" isn't in the same
  # print() statement as the last header...

  my $headers = '';
  if (! $Opts{Command_Line}) {
    $headers .= "Content-Type: application/rdf+xml; charset=utf-8\n\n";
  }

  print $headers;
}

sub file_uri ($)
{
  my ($uri) = @_;
  &rdf_header();
  empty_RDF();
  exit;
}

sub empty_RDF ()
{
  print "<rdf:RDF xmlns:rdf='http://www.w3.org/1999/02/22-rdf-syntax-ns#'>
</rdf:RDF>
";
}

sub encode (@)
{
  return HTML::Entities::encode(@_);
}

sub hprintf (@)
{
  if (! $Opts{HTML}) {
    printf(@_);
  } else {
    print HTML::Entities::encode(sprintf($_[0], @_[1..@_-1]));
  }
}

# Local Variables:
# mode: perl
# indent-tabs-mode: nil
# tab-width: 2
# perl-indent-level: 2
# End:
# ex: ts=2 sw=2 et
