#!/usr/bin/perl
use strict;
use warnings;
use diagnostics;
use File::Basename;
use DBI;
use LWP::UserAgent;

sub wantMeToWhat { die "pick a file and try again" if -t STDIN and not @ARGV; }

#mysql -pxlsf01 -hsirdave.com -usirdavec_a sirdavec_articles
my $dbh = DBI->connect('DBI:mysql:sirdavec_articles;host=sirdave.com', 'sirdavec_a', 'xlsf01',
		       { RaiseError => 1 })
  or die "Cannot connect: " . $DBI::errstr;

my $saved=0;
my $skipped=0;
my $inserted=0;
my $quiet=0;
my $table='articles';
my $MAX_CONTENT=5000000;
my $MAX_TITLE_DISPLAY=90;
my $MAX_TITLE=600;
my $MAX_URL_LEN=9000;

count($table) if table_exists($table);

binmode STDOUT, ':utf8';

my @urls=@ARGV;

if (!@urls) { if (-t STDIN) { *ARGV=*DATA; } else { @urls=<>; } }

foreach my $url (@urls) {
  chomp $url;
  next if $url =~ /#######/;

  if ($url =~ /^-q$/) {
    $quiet=1;
    next;
  }

  $saved=0;
  if ($url =~ /pdf$/i) {
    $saved=getPDF($url);
  }

  last if $url =~ /__END__/;    #Perl should do this.

  $url=parseUrl($url);

  my $stmt=sprintf "SELECT id,size FROM %s WHERE url='%s'", $table, $url;
  my($id, $size)=reap($stmt);

  my $inDb=0;
  if($id) {
    $inDb=1;
    printf "%s: %s %s\n", $id, $size, $url;
  } else {
    $inDb=insert($url);
  }
  my $stash=saveUrl($url,$inDb);
  printf "Stashed uri in %s\n", $stash if -f $stash;
}

count($table);

printf "%ss: %s urls skipped, %s inserted.\n", (time-$^T), $skipped, $inserted;
exit 0;


sub parseUrl {
  my($uri, @rest)=@_;

  if (length($uri) > $MAX_URL_LEN) {
    printf "url too long: %s bytes\n", length($uri);
    return 0;
  }

  return 1 if ($uri =~ /^-q$/);	#shh

  $uri =~ s/^'//;	       #In case of embedded # on command line.
  $uri =~ s/'$//;
  $uri =~ s/^\s+//;		#Strip leading spaces
  $uri =~ s/\s+$//;		#Strip trailing spaces
  return $uri;
}

sub table_exists {
  my($table, $loud)=@_;
  my $stmt=sprintf "SHOW TABLES like \'%%%s%%\'", $table;

  my @tables=reap($stmt);
  my $t;
  foreach $t (@tables) {
    if ($t =~ /$table/) {
      printf "Found table %s\n", $t if $loud;
      return $t;
    }
  }
  return init($t);
}


sub init {
  my $newsize=$_;
  $MAX_CONTENT=$newsize if ($newsize);

  drop_table($table);

  #Put content in its own table with id as key:
  my $stmt=sprintf "CREATE TABLE %s ( id MEDIUMINT NOT NULL AUTO_INCREMENT,
                          url VARCHAR(512) UNIQUE NOT NULL,
                          title VARCHAR(200),
                          content VARCHAR(%d),
                          size MEDIUMINT,
                          PRIMARY KEY (id)
    )", $table, $MAX_CONTENT;

  my $result=sow($stmt);
  return $result or 1;
}


sub sow {
  my($statement, $loud)=@_;
  printf "sowing %s\n", $statement if $loud;

  my $sthandle=$dbh->prepare($statement) or die $dbh->errstr;
  my $result=$sthandle->execute();

  my $e=$dbh->errstr;
  swing_into_action($e) if length($e);
  return $result;
}


sub reap {
  my($statement, $loud)=@_;
  printf "reaping: %s\n", $statement if $loud;

  my $sth=$dbh->prepare($statement); # prepare the query
  $sth->execute();		     # execute the query

  my @vals=$dbh->selectrow_array($statement);

  my $e=$dbh->errstr;
  swing_into_action($e) if ($e);

  return @vals;
}


sub harvest {
  my($statement, $loud)=@_;
  #printf "reaping: %s\n",$statement if $loud;

  my $data_ref=$dbh->selectall_arrayref($statement, { 'Columns' => {} });
  if (!$quiet) {
    for my $row (@$data_ref) {
      print "$_ => $row->{$_}\n" for keys %$row;
    }
  }

  my $e=$dbh->errstr;
  swing_into_action($e) if length($e);
  return $data_ref;
}


sub swing_into_action {
  my $err=$_;
  if ($err) {
    die $err if ($err =~ /Specified key was too long/);
    warn $err;

    if (($err =~ /Table.*doesn't exist/)
	or ($err =~ /Unknown table/)
	or ($err =~ /Unknown column 'file'/)) {
      init($table);
    }
    die $err;
  }
}


sub count {
  my($table, $loud)=@_;
  my $stmt=sprintf 'SELECT COUNT(id) FROM %s', $table;
  my @counts=reap($stmt);
  foreach my $count (@counts) {
    next unless $count =~ /^\d+$/;
    printf "%s ids found.\n", $count if $loud;
    return $count;
  }
  return -1;
}


sub insert {
  my($url, @rest)=@_;
  return 0 if ($url eq '');

  my $ua=new LWP::UserAgent;
  $ua->agent("Mozilla/8.0");    #Fake out Ceasefire mag.

  my $req=new HTTP::Request 'GET' => "$url";
  $req->header('Accept' => 'text/html');

  # send request
  my $error;
  my $resp=$ua->request($req);
  my $content=$resp->decoded_content;

  # check the outcome
  if ($resp->is_success) {
    #Save to file for cf. to db content.
    #print $resp->decoded_content;
  } else {
    #Make an array of status codes and loop it.
    $error=$resp->status_line . "\n";
    if ($error) {
      chomp $error;
      if ($error =~ /301/) {
      }

      if ($error =~ /302 Found/) {
	printf "Curse ye NYT\nYARR\n";
      }

      if ($error =~ /400 URL must be absolute/) {
      }
      if ($error =~ /403 Forbidden/) {
      }
      if ($error =~ /404.*Not Found/) {
      }
      if ($error =~ /406 Not Acceptable/) {
      }
      if ($error =~ /500 Can't connect/) {
      }
      if ($error =~ /501 Protocol scheme/) {
	printf "'https' is not supported (LWP::Protocol::https not installed)\n";
      }
      printf "Error: %s\n", $error;
      return $error;
    }
    
    if ($error =~ /503: Service Unavailable:/) {
      # http://www.zdnet.com/news/spy-agency-taps-into-undersea-cable/115877
    }
  }

  my $size=length($content);

  if ($size == 0) {
    $skipped++;
    return 0;
  } else {
    #printf STDERR "Got %s bytes at %s\n", $size, $url;
    if ($size > $MAX_CONTENT) {
      init($size);
    }
  }

  my $title=$content;

  #This takes forever with a pdf.
  #Find a way to adapt title search by MIME type?

  if ($title =~ s/^.*<title>//imsx) {
    #printf STDERR "Found title..";
  } else {
    printf STDERR "Found no title tag\n";
    $title=$url;
  }

  if ($title =~ s#</title>.*##imsx) {
    #printf STDERR ". and title end... ";
  } else {
    printf STDERR "...anywhere.  ";
  }

  $title =~ s/^\s+//;
  $title =~ s/\s+$//;

  my $l=length($title);

  if ($l == 0) {
    warn "No title!";
  }

  if ($l > $MAX_TITLE) {
    printf "%s is too big for a title.  Next!\n%s\n", $l, $title;
    return -1;
  }

  if ($l > $MAX_TITLE_DISPLAY) {
    $title=substr($title, 0, $MAX_TITLE_DISPLAY-$l);
  }

  $url=$dbh->quote($url);
  $content=$dbh->quote($content);
  $title=$dbh->quote($title);

  #Placeholders are your friends.
  my $st=
    sprintf "INSERT INTO %s (url,title,content,size) VALUE (%s,%s,%s,%s)",
      $table, $url, $title, $content, $size; #Size of unquoted content.

  $inserted += sow($st);

  $st=sprintf "SELECT `id`,`title`,`url` FROM %s WHERE `url`=%s", $table, $url;
  my $sth=$dbh->prepare($st) or die $dbh->errstr;
  $sth->execute();

  my $id;
  ($id, $title, $url, @rest)=reap($st);

  my $len=length($title);
  printf "Id: %s  ",     $id   if ($id);
  printf "Content: %s ", $size if ($size);
  printf "Title: %s\n",  $len  if ($len);
  printf "%s\n", $url;

  return 1;
}


sub drop_table {
  my($t, @rest)=@_;
  my $smt=sprintf "DROP TABLE IF EXISTS %s", $t;
  printf "Dropping table %s\n", $t;
  sow($smt);
  return 0;
}


sub list_articles {
  my $st="SELECT `id`,`size`,`title` FROM " . $table;
  my $loud=1;
  my $dref=harvest($st, $loud);

  for my $row (@$dref) {
    for (keys %$row) {
      $row->{$_} =~ s/^.*<title>//im;
      $row->{$_} =~ s^<title/>.*$^^im;
      print "$_ => $row->{$_}\n" unless $quiet;
    }
  }
  exit 0;
}


sub saveUrl {
  my($uri,$inDb)=@_;
  chomp $uri;
  my $ustash="$ENV{DOCUMENTS}";
  die "DOCUMENTS not defined\n" unless -d $ustash;

  if ($inDb) {
    $ustash .= '/urls.remote.txt';
  } else {
    $ustash .= '/urls.unused.remote.txt';
  }

  my $I; #Earache

  my @uris=();
  if (open($I, "<", $ustash)) {
    while(<$I>){
      chomp;
      push @uris, $_;
    }
  }

  my $skipit=0;
  foreach my $u (@uris) {
    if ($u eq $uri) {
      $skipit=1;
      return '';
    }
  }
  close $I;

  push(@uris,$uri);
  open(my $O, ">", $ustash) or die $!;
  foreach my $u (@uris) {
    printf $O "%s\n", $u;
  }
  close $O;

  svn($ustash);
  return $ustash;
}

sub svn {
  my $stash=shift;
  die $! unless -f $stash;

  my $myname=basename($0);
  my $cmd=sprintf 'svn ci -m "%s" %s >/dev/null 2>&1', qq/$myname/, $stash;
  #printf "%s\n", $cmd;
  my $result=`$cmd`;
  printf "%s\n", $result;

  #$cmd=sprintf "svn info %s", $stash;
  #printf "%s\n", $cmd;
  #$result=`$cmd`;
  #printf "%s\n", $result;
}

sub getPDF {
  my($uri, @rest)=@_;
  my $pdffile=basename($uri);
  $pdffile=qq/$pdffile/;
  $pdffile =~ s/%20/ /g;

  if (-f "$pdffile" ) {
    #system("/usr/bin/evince $pdffile &");
    return 0;
  }

  my $ua=new LWP::UserAgent;
  $ua->agent("Mozilla/8.0");    #Fake out Ceasefire mag.

  my $req=new HTTP::Request 'GET' => "$uri";
  $req->header('Accept' => 'application/pdf');


  # send request
  my $error;
  my $resp=$ua->request($req);
  my $content=$resp->decoded_content;

  # check the outcome
  if ($resp->is_success) {

    open my $P, ">",  $pdffile or die $!; #s/b binary mode?
    binmode $P, ':utf8';
    printf $P "%s\n", $resp->decoded_content;
    close $P;

    die $! if (!-f $pdffile);
    #Save to file for cf. to db content.
    #print $resp->decoded_content;
    return 1;
  }
  return 0;
}

__DATA__
http://news.cnet.com/8301-31921_3-20027893-281.html
http://cryptome.org/2013/06/dhs-prism.pdf
http://www.theonion.com/articles/new-liver-complains-of-difficulty-working-with-lou,32669/
http://www.nytimes.com/2009/08/18/science/18dna.html
http://www.sfgate.com/news/article/Wife-Lou-Reed-recovering-after-liver-transplant-4568197.php
http://blacklistednews.com/?news_id=26422&print=1
http://ceasefiremagazine.co.uk/whistleblower-al-qaeda-chief-u-s-asset/index.php
http://defnesumanblogs.com/2013/06/01/what-is-happenning-in-istanbul/
http://en.wikipedia.org/wiki/Mapreduce
http://ergoemacs.org/emacs/keyboard_shortcuts.html
http://forums.mysql.com/read.php?98,234850,234850
'http://online.wsj.com/article/SB10001424127887323335404578444682892520530.html#printMode'
http://perl.plover.com/yak/hw1/samples/slide032.html
http://rinf.com/alt-news/breaking-news/yahoo-to-users-let-us-read-your-emails-or-goodbye/38190/
http://sedition.com/perl/perl-colorizer.html
https://en.wikipedia.org/wiki/File:Global_networks_front_cover.jpg
https://en.wikipedia.org/wiki/Mapreduce
http://sourceforge.net/apps/mediawiki/csound/index.php?title=Score_Preprocessing
http://stackoverflow.com/posts/4831965/revisions
http://thechart.blogs.cnn.com/2011/11/14/high-iq-linked-to-drug-use/
http://web.archive.org/web/20130420133850/http://911review.org/brad.com/batcave/
http://www.alternet.org/print/meet-doctor-big-pharma-cant-shut
ttp://www.apnewsarchive.com/1998/Americans-Warned-of-Retaliation/id-d2861c55856d0ca36a96adb1e73dd0b3
http://www.citywatchla.com/4box-right/5005-humans-used-for-radiation-experiments-a-shameful-chapter-in-us-history
http://www.devshed.com/c/a/MySQL/Error-Handling/2/
http://www.easysoft.com/developer/languages/perl/dbd_odbc_tutorial_part_2.html
http://www.emacswiki.org/emacs/HtmlRendering
http://www.howtoforge.com/mysql_database_replication
http://www.huffingtonpost.com/kristen-breitweiser/hey-americacan-you-hear-m_b_3396464.html
http://www.huffingtonpost.com/randy-fox/drive-ins-of-the-rust-belt-photos_b_1022608.html
http://www.informationclearinghouse.info/article35190.htm
http://www.newscientist.com/channel/mech-tech/mg18725126.300
http://www.perlmonks.org/?node_id=206756
http://www.perlmonks.org/?node_id=387965
http://www.stanford.edu/class/ee368b/Handouts/11-TransformCoding.pdf
http://www.uncut.co.uk/blog/uncut-editors-diary/robert-fripp-steals-the-show-in-david-bowie-documentary
http://www.w3cyberlearnings.com/MySQL_Stored_Procedure_for_Select,_Insert,_Update,_and_Delete
http://www.webdeveloper.com/forum/showthread.php?259293-How-to-save-url-as-text-file-in-chrome-using-javascript
http://www.xaprb.com/blog/2006/04/30/how-to-optimize-subqueries-and-joins-in-mysql/
http://www.xaprb.com/blog/2006/06/23/how-to-select-from-an-update-target-in-mysql/
http://blacklistednews.com/?news_id=26422&print=1
http://ceasefiremagazine.co.uk/whistleblower-al-qaeda-chief-u-s-asset/index.php
http://defnesumanblogs.com/2013/06/01/what-is-happenning-in-istanbul/
http://discussion.guardian.co.uk/comment-permalink/24144675
http://en.wikipedia.org/wiki/Mapreduce
http://ergoemacs.org/emacs/keyboard_shortcuts.html
http://forums.mysql.com/read.php?98,234850,234850
http://ggsidedocs.blogspot.com.br/2013/01/frequently-told-lies-ftls.html
http://mashable.com/2013/06/08/prism-nsa-direct-access/
http://online.wsj.com/article/SB10001424127887323335404578444682892520530.html	#printMode
http://perl.plover.com/yak/hw1/samples/slide032.html
http://rinf.com/alt-news/breaking-news/yahoo-to-users-let-us-read-your-emails-or-goodbye/38190/
http://sedition.com/perl/perl-colorizer.html
http://sourceforge.net/apps/mediawiki/csound/index.php?title=Score_Preprocessing
http://stackoverflow.com/posts/4831965/revisions
http://techcrunch.com/2013/06/08/cooperation-methods-protected-innocents-from-prism/
http://thechart.blogs.cnn.com/2011/11/14/high-iq-linked-to-drug-use/
http://web.archive.org/web/20130420133850/http://911review.org/brad.com/batcave/
http://web1.nusd.k12.az.us/schools/nhs/gthomson.class/articles/executive/BUSH.SIGNED.NSA.ORDER.PDF
http://wikileaks.org/wiki/Perry_Fellwock
http://www.alternet.org/print/meet-doctor-big-pharma-cant-shut
ttp://www.apnewsarchive.com/1998/Americans-Warned-of-Retaliation/id-d2861c55856d0ca36a96adb1e73dd0b3
http://www.citywatchla.com/4box-right/5005-humans-used-for-radiation-experiments-a-shameful-chapter-in-us-history
http://www.devshed.com/c/a/MySQL/Error-Handling/2/
http://www.dominikgorecki.com/2013/06/in-response-to-the-supporters-of-prism/
http://www.easysoft.com/developer/languages/perl/dbd_odbc_tutorial_part_2.html
http://www.emacswiki.org/emacs/HtmlRendering
http://www.guardian.co.uk/commentisfree/2013/jun/11/nsa-surveillance-us-behaving-like-china
http://www.guardian.co.uk/world/2013/jun/06/national-security-agency-surveillance
http://www.guardian.co.uk/world/2013/jun/09/edward-snowden-nsa-whistleblower-surveillance
http://www.howtoforge.com/mysql_database_replication
http://www.huffingtonpost.com/kristen-breitweiser/hey-americacan-you-hear-m_b_3396464.html
http://www.huffingtonpost.com/randy-fox/drive-ins-of-the-rust-belt-photos_b_1022608.html
http://www.informationclearinghouse.info/article35190.htm
http://www.mrtom.com/quotes/huxley_essay.htm
http://www.newscientist.com/channel/mech-tech/mg18725126.300
http://www.perlmonks.org/?node_id=206756
http://www.perlmonks.org/?node_id=387965
http://www.sfgate.com/news/article/Wife-Lou-Reed-recovering-after-liver-transplant-4568197.php
http://www.stanford.edu/class/ee368b/Handouts/11-TransformCoding.pdf
http://www.theatlantic.com/technology/archive/2013/06/why-should-we-even-care-if-the-government-is-collecting-our-data/276732/
http://www.theonion.com/articles/new-liver-complains-of-difficulty-working-with-lou,32669/
http://www.theverge.com/2013/6/8/4409746/nsas-boundless-informant-software-leaked
http://www.uncut.co.uk/blog/uncut-editors-diary/robert-fripp-steals-the-show-in-david-bowie-documentary
http://www.w3cyberlearnings.com/MySQL_Stored_Procedure_for_Select,_Insert,_Update,_and_Delete
http://www.webdeveloper.com/forum/showthread.php?259293-How-to-save-url-as-text-file-in-chrome-using-javascript
http://www.xaprb.com/blog/2006/04/30/how-to-optimize-subqueries-and-joins-in-mysql/
http://www.xaprb.com/blog/2006/06/23/how-to-select-from-an-update-target-in-mysql/
https://en.wikipedia.org/wiki/File:Global_networks_front_cover.jpg
https://en.wikipedia.org/wiki/Mapreduce
https://papers.ssrn.com/sol3/Delivery.cfm/SSRN_ID1098449_code249137.pdf?abstractid=998565
https://papers.ssrn.com/sol3/papers.cfm?abstract_id=998565
https://www.eff.org/deeplinks/2013/06/director-national-intelligences-word-games-explained-how-government-deceived
https://www.eff.org/deeplinks/2013/06/government-says-secret-court-opinion-law-underlying-prism-program-needs-stay
https://www.nytimes.com/2011/10/07/us/politics/white-house-orders-new-computer-security-rules.html
http://www.guardian.co.uk/world/video/2013/jun/09/nsa-whistleblower-edward-snowden-interview-video
http://electrospaces.blogspot.nl/2013/06/are-nsas-prism-slides-photoshopped.html
http://news.cnet.com/8301-13578_3-57588337-38/no-evidence-of-nsas-direct-access-to-tech-companies/
http://abcnews.go.com/Technology/nsa-prism-dissecting-technology-companies-adamant-denial-involvement/story?id=19350095
http://news.cnet.com/8301-13578_3-57587005-38/justice-department-tries-to-force-google-to-hand-over-user-data/
http://www.rand.org/pubs/technical_reports/TR434.html
http://www.dni.gov/index.php/newsroom/press-releases/191-press-releases-2013/869-dni-statement-on-activities-authorized-under-section-702-of-fisa
http://www.independent.co.uk/news/world/americas/the-us-bank-and-the-secret-plan-to-destroy-wikileaks-2215059.html
http://www.forbes.com/sites/timworstall/2013/06/07/nsas-prism-sounds-like-a-darn-good-idea-to-me-this-is-what-governments-are-for/
http://news.yahoo.com/nsa-prism-leak-much-less-seems-141000562.html?.tsrc=rtlde/
http://www.guardian.co.uk/commentisfree/cifamerica/2011/jun/22/hacking-anonymous
http://lists.mysql.com/mysql/209024
https://dev.mysql.com/doc/refman/5.0/en/innodb-backup.html
http://politicaloutcast.com/2013/07/montana-enacts-law-prohibiting-nsa-spying/#ixzz2YwtbnXid
http://rt.com/news/snowden-meet-human-rights-000/
'https://www.ksl.com/index.php?nid=148&sid=25978926'
https://www.ksl.com/index.php?nid=148&sid=25978926
http://thehill.com/blogs/global-affairs/human-rights/310773-obama-administration-urges-human-rights-groups-to-abandon-snowden
http://www.ancestry.com/wiki/index.php?title=User:Paulmd199
http://www.dw.de/nsa-claims-surveillance-program-foiled-50-terror-plots/a-16893066
http://www.globalresearch.ca/echelon-today-the-evolution-of-an-nsa-black-program/5342646?utm_source=rss&utm_medium=rss&utm_campaign=echelon-today-the-evolution-of-an-nsa-black-program
'http://www.ksl.com/index.php?nid=148&sid=25978926'
http://www.ksl.com/index.php?nid=148&sid=25978926
http://www.mcclatchydc.com/2013/07/12/196567/edward-snowden-asks-russia-for.html#.UeCVwVIiiRl
http://www.nationofchange.org/snowden-s-constitution-obama-s-constitution-and-criminal-law-1372943055

__END__
Implementing basename in SQL:
CREATE TEMPORARY TABLE t (`id` mediumint unique, `size` mediumint);
SELECT RIGHT(url, LOCATE('/',REVERSE(url))-1) FROM articles;
#############################################################################
