summary refs log tree commit diff stats
diff options
context:
space:
mode:
author David A. Madore <david+git@madore.org> 2010-03-07 23:08:22 (GMT)
committer David A. Madore <david+git@madore.org> 2010-03-07 23:08:22 (GMT)
commit fe246fdcd6ca8a7863ae6e80e58ab7c544feb820 (patch)
tree 2dd1b5dd34336b99347cc690c3ecdc7d6c035d79
download blogengine-fe246fdcd6ca8a7863ae6e80e58ab7c544feb820.zip
blogengine-fe246fdcd6ca8a7863ae6e80e58ab7c544feb820.tar.gz
blogengine-fe246fdcd6ca8a7863ae6e80e58ab7c544feb820.tar.bz2
Initial script for inserting weblog entries in database.
-rwxr-xr-x insert-entries.pl 165
-rw-r--r-- weblog.sql 12
2 files changed, 177 insertions, 0 deletions
diff --git a/insert-entries.pl b/insert-entries.pl
new file mode 100755
index 0000000..bde5aa2
--- /dev/null
+++ b/insert-entries.pl
@@ -0,0 +1,165 @@
+#! /usr/local/bin/perl -w
+
+# Insert weblog entries into a PostgreSQL database.
+
+require 5.10.0;
+
+use strict;
+use warnings;
+
+use Encode;
+
+use XML::LibXML qw(:libxml :ns);
+
+use DBI;
+
+use Digest::SHA1 qw(sha1_hex);
+
+use Getopt::Std;
+
+# Identifiers of the XHTML 1.0 Strict document type and of the XHTML
+# namespace.
+use constant XHTML_PUBID => "-//W3C//DTD XHTML 1.0 Strict//EN";
+use constant XHTML_URI   => "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd";
+use constant XHTML_NS    => "http://www.w3.org/1999/xhtml";
+
+# Command-line options:
+#   -c        also store a creation date (cdate) derived from each entry's
+#             "date" attribute
+#   -d NAME   PostgreSQL database to use (defaults to "weblog")
+my %opts;
+
+getopts('cd:', \%opts);
+
+# The option value has to be the left operand of //: a string literal is
+# always defined, so with the operands the other way round the default
+# would win every time and -d could never take effect.
+my $dbname = $opts{d} // "weblog"; # The PostgreSQL database to use
+my $obtain_cdates = $opts{c};
+
+# The one parser instance used by this script; the input callbacks and the
+# XML catalog are attached to it further down, before any document is parsed.
+my $parser = XML::LibXML->new();
+
+
+# Functions to take care of the x-daml-magic URI scheme (this should go away)
+
+# Match callback: true when the URI uses the x-daml-magic scheme, i.e. when
+# the other daml-magic callbacks should handle it.
+sub match_daml_magic_uri {
+    my ($uri) = @_;
+    return $uri =~ /^x-daml-magic\:/;
+}
+
+# Canned content for every x-daml-magic URI this script knows about, keyed
+# by the full URI.  A URI that is not listed here cannot be opened.
+my %daml_magic_values = (
+    'x-daml-magic://daml/weblog/domaxentries.xml' => 'INCLUDE',
+    'x-daml-magic://daml/weblog/maxentries.xml'   => '20',
+    'x-daml-magic://daml/weblog/domonth.xml'      => 'IGNORE',
+    'x-daml-magic://daml/weblog/month.xml'        => '',
+    'x-daml-magic://daml/weblog/docategory.xml'   => 'IGNORE',
+    'x-daml-magic://daml/weblog/category-en.xml'  => '',
+    'x-daml-magic://daml/weblog/category-fr.xml'  => '',
+);
+
+# Open callback: look the URI up in %daml_magic_values.
+# Returns a reference to a private copy of the stored content, or undef
+# when the URI is unknown.  The copy matters: the read callback consumes
+# the referenced string, and the table itself must stay intact.
+sub open_daml_magic_uri {
+    my ($uri) = @_;
+    return undef unless defined( $daml_magic_values{$uri} );
+    my $content = $daml_magic_values{$uri};
+    return \$content;
+}
+
+# Read callback: remove up to $length characters from the front of the
+# string referenced by $handler and return them.  The referenced string
+# shrinks on every call, so repeated reads eventually return "".
+sub read_daml_magic_uri {
+    my ($handler, $length) = @_;
+    # Four-argument substr replaces the extracted prefix with "" in place.
+    return substr(${$handler}, 0, $length, "");
+}
+
+# Close callback: there is nothing to release, so just report success.
+# "0 but true" is numerically zero yet true in boolean context.
+sub close_daml_magic_uri {
+    my ($handler) = @_;
+    return "0 but true";
+}
+
+# Hook the x-daml-magic handlers (match, open, read, close -- in that
+# order) into the parser.
+my $input_callbacks = XML::LibXML::InputCallback->new();
+my @daml_magic_handlers = (
+    \&match_daml_magic_uri,
+    \&open_daml_magic_uri,
+    \&read_daml_magic_uri,
+    \&close_daml_magic_uri,
+);
+$input_callbacks->register_callbacks(\@daml_magic_handlers);
+$parser->input_callbacks($input_callbacks);
+
+# Avoid accessing the W3C web site all the time.
+$parser->load_catalog("/etc/xml/catalog");
+
+# Parse the file named by the first argument, or standard input when no
+# argument was given.
+my $doc = defined($ARGV[0])
+    ? $parser->parse_file($ARGV[0])
+    : $parser->parse_fh(\*STDIN);
+
+# Connect to the database and pin the session time zone.
+my %connect_attrs = (
+    AutoCommit     => 1,
+    PrintError     => 1,
+    pg_enable_utf8 => 1,
+);
+my $dbh = DBI->connect("dbi:Pg:dbname=$dbname", "", "", \%connect_attrs)
+    or die "Can't connect to database";
+$dbh->do("SET TIME ZONE 0")
+    or die "Can't set timezone";
+
+# Statement used to look up the stored checksum of an entry.
+my ($sth_check, $sth_insert, $sth_update);
+$sth_check = $dbh->prepare("SELECT sha1 FROM entries WHERE id=?")
+    or die "Can't prepare statement";
+
+# The insert/update statements carry one extra trailing column (cdate)
+# when creation dates are to be taken from the entries themselves.
+my ($insert_sql, $update_sql);
+if ( $obtain_cdates ) {
+    $insert_sql = "INSERT INTO entries(id,edate,lang,title,content,sha1,cdate) VALUES (?,?,?,?,?,?,?)";
+    $update_sql = "UPDATE entries SET (edate,mdate,lang,title,content,sha1,cdate)=(?,DEFAULT,?,?,?,?,?) WHERE id=?";
+} else {
+    $insert_sql = "INSERT INTO entries(id,edate,lang,title,content,sha1) VALUES (?,?,?,?,?,?)";
+    $update_sql = "UPDATE entries SET (edate,mdate,lang,title,content,sha1)=(?,DEFAULT,?,?,?,?) WHERE id=?";
+}
+$sth_insert = $dbh->prepare($insert_sql)
+    or die "Can't prepare statement";
+$sth_update = $dbh->prepare($update_sql)
+    or die "Can't prepare statement";
+
+# Return the xml:lang value in effect for $node: the attribute of the
+# node itself or, failing that, of its nearest ancestor element that has
+# one.  Returns undef (an empty list in list context) when no element on
+# the way up carries xml:lang.
+sub get_node_lang {
+    my ($node) = @_;
+    # Only element nodes have attributes.  The parent of the root element
+    # is the document node, which has no getAttributeNS method, so the walk
+    # must stop there instead of calling it and dying.
+    while ( defined($node) && $node->nodeType == XML_ELEMENT_NODE ) {
+        my $lang = $node->getAttributeNS(XML_XML_NS, "lang");
+        return $lang if defined($lang);
+#        $lang = $node->getAttribute("lang");
+#        return $lang if defined($lang);
+        $node = $node->parentNode;
+    }
+    # Explicit "nothing found": falling off the end of a loop would leave
+    # the return value unspecified.
+    return;
+}
+
+# Walk every <entry> child of a <weblog> element.  An entry whose stored
+# SHA-1 equals the SHA-1 of its current serialization is skipped; an entry
+# already present with a different checksum is updated; any other entry is
+# inserted.
+my $xpc = XML::LibXML::XPathContext->new($doc);
+#$xpc->registerNs('h', XHTML_NS);
+my $entry_list = $xpc->findnodes("//weblog/entry");
+foreach my $entry ( $entry_list->get_nodelist ) {
+    my $id   = $entry->getAttribute("number") + 0;
+    my $date = $entry->getAttribute("date");
+
+    # Write the effective language onto the entry itself before it is
+    # serialized, so the stored fragment carries its own xml:lang.
+    my $lang = get_node_lang($entry);
+    if ( defined($lang) ) {
+        $entry->setAttributeNS(XML_XML_NS, "lang", $lang);
+    }
+
+    my $str  = $entry->serialize();
+    my $sha1 = sha1_hex(encode("utf8", $str));
+
+    # Skip entries whose stored checksum already matches.
+    $sth_check->execute($id) or die "Can't check sha1 for entry";
+    my $exists = $sth_check->rows;
+    if ( $exists ) {
+        my ($stored_sha1) = $sth_check->fetchrow_array;
+        next if $stored_sha1 eq $sha1;
+    }
+
+    # The title is the text of the first <title> child, if there is one.
+    my ($title_node) = $xpc->findnodes("title", $entry);
+    my $title;
+    $title = $title_node->textContent if defined($title_node);
+
+    # Creation date: noon UTC on the entry's date.  A date written as
+    # March 32 is stored as April 1 of the same year.
+    my $cdate;
+    if ( $obtain_cdates ) {
+        $cdate = ( $date =~ /^(\d+)-03-32$/ )
+            ? "$1-04-01 12:00:00+00"
+            : "$date 12:00:00+00";
+    }
+
+    # Values shared by both statements, in placeholder order; cdate is the
+    # extra trailing column when creation dates are being recorded.
+    my @fields = ($date, $lang, $title, $str, $sha1);
+    push @fields, $cdate if $obtain_cdates;
+
+    if ( $exists ) {
+        print STDERR "Updating entry $id\n";
+        $sth_update->execute(@fields, $id)
+            or die "Can't update entry";
+    } else {
+        print STDERR "Registering entry $id\n";
+        $sth_insert->execute($id, @fields)
+            or die "Can't register entry";
+    }
+}
diff --git a/weblog.sql b/weblog.sql
new file mode 100644
index 0000000..603dab1
--- /dev/null
+++ b/weblog.sql
@@ -0,0 +1,12 @@
+-- Schema of the table filled by insert-entries.pl.
+SET TIME ZONE 0;
+
+CREATE TABLE entries (
+    -- Entry number; one row per entry.
+    id      integer                  PRIMARY KEY,
+    -- Entry date, kept as free text rather than as a date type.
+    edate   text                     NOT NULL,
+    -- Creation and modification timestamps; both default to "now".
+    cdate   timestamp with time zone NOT NULL DEFAULT CURRENT_TIMESTAMP,
+    mdate   timestamp with time zone NOT NULL DEFAULT CURRENT_TIMESTAMP,
+    -- Language and title are optional.
+    lang    text,
+    title   text,
+    -- Serialized XML of the entry and its SHA-1 checksum.
+    content xml                      NOT NULL,
+    sha1    text                     NOT NULL
+);
+
+CREATE INDEX entries_edate_key ON entries (edate);