Added: branches/eprints-integration/lib/dublin_core.rb (0 => 2408)
--- branches/eprints-integration/lib/dublin_core.rb (rev 0)
+++ branches/eprints-integration/lib/dublin_core.rb 2010-05-17 15:10:31 UTC (rev 2408)
@@ -0,0 +1,68 @@
+# myExperiment: lib/dublin_core.rb
+#
+# Copyright (c) 2010 University of Manchester and the University of Southampton.
+# See license.txt for details
+
+require 'nokogiri'
+
+module DublinCore
+
+ SCHEMA_URIS = [
+ 'http://purl.org/DC/elements/1.0/',
+ 'http://purl.org/DC/elements/1.1/'
+ ]
+
+ FIELDS = [
+ 'title',
+ 'creator',
+ 'subject',
+ 'description',
+ 'publisher',
+ 'contributer',
+ 'date',
+ 'type',
+ 'format',
+ 'identifier',
+ 'source',
+ 'language',
+ 'relation',
+ 'coverage',
+ 'rights'
+ ]
+
+ def self.extract_dublin_core_data_from_page(uri)
+ doc = Nokogiri::HTML(open(uri))
+ dcns = nil
+ DublinCore::SCHEMA_URIS.each do |schema_uri|
+ rel = doc.xpath("/html/head/link[regex(., '#{Regexp.escape(schema_uri)}')]/@rel", Class.new {
+ def regex(node_set, regex)
+ node_set.find_all { |node| node['href'] =~ /#{regex}/i }
+ end
+ }.new)
+ rel.each do |rel_node|
+ matches = rel_node.value.match(/schema\.(\w+)/i)
+ unless matches.nil?
+ dcns = matches[1]
+ break
+ end
+ end
+ break unless dcns.nil?
+ end
+ meta = doc.xpath("/html/head/meta[regex(., '^#{Regexp.escape(dcns)}\\.\\w+$')]", Class.new {
+ def regex(node_set, regex)
+ node_set.find_all { |node| node['name'] =~ /#{regex}/i }
+ end
+ }.new)
+ dcdata = Hash.new
+ meta.each do |meta_tag|
+ field = meta_tag['name']
+ field =~ /#{Regexp.escape(dcns)}\\.(\\w+$)/i
+ puts field
+ if DublinCore::FIELDS.map{|dcfield| "#{dcns}.#{dcfield}".downcase}.include?(meta_tag['name'].downcase)
+ dcdata[meta_tag['name'].downcase] = meta_tag['content']
+ end
+ end
+ dcdata
+ end
+
+end