#!/usr/bin/env ruby
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
require 'net/http'
require 'fileutils'
require 'rexml/document'
# A giant sucking sound.
module Sucker
DIR_EXCL = ['..', '.']
# Value object identifying a repository artifact by its Maven-style
# coordinates: group id, artifact name and version.
class Artifact
  attr_reader :group, :name, :version

  # group, name, version - the three coordinates, stored as given.
  def initialize(group, name, version)
    @group = group
    @name = name
    @version = version
  end

  # Instance variables to include when the object is dumped to YAML.
  def to_yaml_properties
    %w{ @group @name @version }
  end

  # == and === both delegate to eql?, so an Artifact behaves the same in
  # plain comparisons, case/when and as a Hash key.
  def ==(art)
    eql?(art)
  end

  def ===(art)
    eql?(art)
  end

  # True when art is an Artifact with identical coordinates. Anything that
  # is not exactly an Artifact (nil, a String, ...) compares as false
  # instead of raising NoMethodError.
  def eql?(art)
    return false unless art.instance_of?(Artifact)

    art.group == @group && art.name == @name && art.version == @version
  end

  # Hash code consistent with eql?: equal coordinates give equal hashes.
  def hash
    @group.hash ^ (@name.hash << 1) ^ (@version.hash << 2)
  end

  # Repository-relative path of the artifact's jar; dots in the group
  # become directory separators.
  def to_path
    "#{@group.tr('.', '/')}/#{@name}/#{@version}/#{@name}-#{@version}.jar"
  end

  # Human-readable description, e.g. "bar v1.0 (project org.foo)".
  def to_s
    "#{name} v#{version} (project #{group})"
  end
end
# Walks a Maven 2 repository over HTTP by scraping its HTML directory
# listings, yielding an Artifact (plus the HTTP connection in use) for
# every listing that is recognised as an artifact version folder.
class Maven2Repository
# Not referenced anywhere inside this class as shown.
# NOTE(review): presumably file extensions that were meant to be skipped
# while scanning listings - confirm intent or remove.
EXCLUDE = ["sha1", "md5", "pom"]
# Optional list of repository-relative paths to start crawling from
# instead of the root ('/').
attr_writer :group_filters
def group_filters
@group_filters
end
# server     - host name of the repository
# port       - port to connect to (note that it comes BEFORE url)
# url        - base path of the repository on the server; prepended to
#              every path requested by #each
# proxy_info - optional array splatted into Net::HTTP::Proxy
def initialize(server, port, url, proxy_info=nil)
@server, @url, @port, @proxy_info = server, url, port, proxy_info
end
# Crawls the repository and yields |artifact, http| for each artifact
# found. A block is required: there is no block_given? check.
def each
# Go through the configured proxy when one was supplied.
if @proxy_info
http = Net::HTTP::Proxy(*@proxy_info).new(@server, @port)
else
http = Net::HTTP.new(@server, @port)
end
# NOTE(review): as they appear in this file, both regexp literals below
# contain nothing but a line break. The HTML patterns they were
# presumably meant to hold (folder rows / file rows of the directory
# listing) look like they were lost - confirm against the upstream
# source. Do not re-indent these lines: leading whitespace on the
# closing line would become part of the pattern.
folder_regexp = /
/
file_regexp = /
/
# Work list of paths still to visit. When group_filters was set this is
# that very array, so the shift/unshift calls below mutate it.
urls = @group_filters || ['/']
while !urls.empty?
url = urls.shift
response = http.get(@url + url)
if (folder_regexp =~ response.body)
# Listing contains sub-folders: queue each of them.
response.body.scan(folder_regexp) do |line|
# Takes the characters from index 51 up to index (length - 4) of the
# matched text as the folder name.
# NOTE(review): presumably strips fixed-width markup around the name;
# the offsets depend on the exact pattern - confirm.
folder_url = "#{url}#{line[51..(line.length - 4)]}/"
# DFS in inverse order (start with 'z'), change to << for BFS
urls.unshift(folder_url)
end
else
# No sub-folders: read the path as <group...>/<name>/<version>.
url_arr = url.split('/')
version = url_arr.pop
name = url_arr.pop
# Remaining segments form the dotted group id. Because the path starts
# with '/', the first segment is empty and the group gets a leading '.'.
group = url_arr.join('.')
if (file_regexp =~ response.body)
artifact = Sucker::Artifact.new(group, name, version)
yield(artifact, http)
end
end
end
end
end
# Read-only view over a Maven POM document parsed with REXML.
class MavenPom
  # Coordinates a POM may leave out and take over from its <parent> block.
  INHERITED_FROM_PARENT = [:groupId, :version].freeze

  # xml - POM source (anything REXML::Document.new accepts).
  def initialize(xml)
    @doc = REXML::Document.new(xml)
  end

  # Returns an Array with one Hash per <license> element found anywhere in
  # the document, mapping each child element name to its text (nil for an
  # empty child such as <comments/>). Returns nil when there is no license.
  def licenses
    lics = []
    @doc.elements.each("//license") do |license_el|
      lic = {}
      # Elements#each only yields element nodes, so no text-node filtering
      # is needed here.
      license_el.elements.each { |child| lic[child.name] = element_text(child) }
      lics << lic
    end
    lics.empty? ? nil : lics
  end

  # Returns a Hash of the children of the <parent> element (element name =>
  # text), or nil when the POM declares no parent.
  def parent
    fields = {}
    @doc.elements.each("//parent/*") { |child| fields[child.name] = element_text(child) }
    fields.empty? ? nil : fields
  end

  # artifactId, groupId, version and name read the matching direct child
  # of the root element. A document-wide "//x" search would return the
  # first match in document order, which is the <parent>'s value whenever
  # the parent block precedes the project's own coordinates, or a license
  # name when the project has no <name>. groupId and version fall back to
  # the parent block when the project does not declare them itself.
  [:artifactId, :groupId, :version, :name].each do |meth|
    define_method(meth) do
      root = @doc.root
      own = root && root.elements[meth.to_s]
      if own
        element_text(own)
      elsif INHERITED_FROM_PARENT.include?(meth)
        inherited = parent
        inherited && inherited[meth.to_s]
      end
    end
  end

  private

  # Text of an element without its trailing line break; nil when the
  # element has no text.
  def element_text(element)
    text = element.text
    text && text.chomp
  end
end
# Writes the license information gathered for a project as an RDF/XML
# (DOAP) file in the current working directory.
class Serializer
  # Conversion from license name to its URL. Entries are tried in order and
  # the first matching pattern wins, so version-specific patterns come
  # before their generic counterparts. The alternations are grouped so that
  # the ".*3" suffix applies to every alternative; ungrouped, a bare "LGPL"
  # or "GPL" always matched the version 3 entry.
  LICENSE = [
    [/Apache.*2/, "http://www.apache.org/licenses/LICENSE-2.0.txt"],
    [/Apache.*1\.1/, "http://www.apache.org/licenses/LICENSE-1.1"],
    [/Apache/, "http://www.apache.org/licenses/LICENSE-1.0"],
    [/BSD/, "http://www.opensource.org/licenses/bsd-license.php"],
    [/Public Domain/, "http://creativecommons.org/licenses/publicdomain/"],
    [/(LGPL|GNU (Library|Lesser)).*3/, "http://www.gnu.org/licenses/lgpl-3.0.txt"],
    [/LGPL|GNU (Library|Lesser)/, "http://opensource.org/licenses/lgpl-license.php"],
    [/(GPL|GNU (General )?Public License).*3/, "http://www.gnu.org/licenses/gpl-3.0.txt"],
    [/GPL|GNU (General )?Public License/, "http://opensource.org/licenses/gpl-license.php"],
    [/MIT|Massachusetts/, "http://opensource.org/licenses/mit-license.php"],
    [/.*/, "Unknown"]
  ]

  # Instantiate with the pom of the project to serialize
  # information for and the one containing the licenses
  # (can be the same, nil, or a parent project pom).
  def initialize(pom, license_pom)
    @pom = pom
    @license_pom = license_pom
  end

  # Builds the RDF document and writes it to
  # "<groupId>-<artifactId>-<version>.rdf" in the current directory.
  def write
    doc = REXML::Document.new
    rdf = doc.add_element("rdf:RDF")
    rdf.add_namespace("rdf", "http://www.w3.org/1999/02/22-rdf-syntax-ns#")
    rdf.add_namespace("doap", "http://usefulinc.com/ns/doap#")
    # The prefix declared here must be the one used by the license elements
    # below, otherwise the output carries an unbound namespace prefix.
    rdf.add_namespace("dis", "http://apache.org/ns/2007/discordia#")

    project = rdf.add_element("doap:Project")
    project.add_attribute("rdf:about", "mvn://#{@pom.groupId}/#{@pom.artifactId}/#{@pom.version}")
    project.add_element("doap:name").text = @pom.name if @pom.name

    # licenses may be nil when the pom declares none.
    ((@license_pom || @pom).licenses || []).each do |lic_info|
      license = project.add_element("dis:has_license")
      off_url, url = sanitize(lic_info["name"], lic_info["url"])
      license.add_attribute("rdf:about", off_url)
      license.add_element("dis:source").text = url
    end

    # Block form so the file handle is flushed and closed.
    File.open("#{@pom.groupId}-#{@pom.artifactId}-#{@pom.version}.rdf", "w") do |io|
      doc.write(io, 0)
    end
  end

  private

  # Reconciles the license info found in a pom.
  # Returns [official_url, source_url]: the official URL is looked up from
  # the license name through LICENSE; the source URL is the one declared in
  # the pom, or the official one when the pom gives none. Without a name,
  # the declared url is used for both.
  def sanitize(name, url)
    return [url, url] unless name

    # The catch-all /.*/ entry guarantees a match.
    _pattern, off_url = LICENSE.find { |pattern, _lic_url| pattern =~ name }
    [off_url, url || off_url]
  end
end
end
# Crawl the central Maven repository. For every artifact, fetch its POM
# and look for license information, climbing the chain of parent POMs
# until one declares licenses. When found, serialize them for the
# original artifact.
repo_sucker = Sucker::Maven2Repository.new("repo1.maven.org", 80, "/maven2")
repo_sucker.each do |artifact, http|
  # The artifact path ends in ".jar"; [0..-5] drops that extension so that
  # ".pom" can be appended. No '/' is inserted after "/maven2" here,
  # unlike for parent POMs below: paths of crawled artifacts already begin
  # with one because the crawler's group carries a leading '.'.
  response = http.get("/maven2#{artifact.to_path[0..-5]}.pom")

  # Only a successful response carries a POM. Redirect and error bodies
  # must not be handed to the XML parser.
  next unless Net::HTTPSuccess === response

  original_pom = Sucker::MavenPom.new(response.body)
  pom = original_pom
  while pom
    licenses = pom.licenses
    if licenses
      # Licenses are in the POM, all is well
      puts "Licenses for #{artifact} => #{licenses.inspect}"
      Sucker::Serializer.new(original_pom, pom).write
      break
    end

    # No license found, checking if this POM declares a usable parent.
    # All three coordinates are needed to build the parent's path.
    parent = pom.parent
    coords = parent && parent.values_at("groupId", "artifactId", "version")
    break unless coords && coords.all?

    # Checking if we can find a license in the parent
    parent_artifact = Sucker::Artifact.new(*coords)
    parent_resp = http.get("/maven2/#{parent_artifact.to_path[0..-5]}.pom")
    pom = Net::HTTPSuccess === parent_resp ? Sucker::MavenPom.new(parent_resp.body) : nil
  end
end