#!/usr/bin/env ruby
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.

require 'net/http'
require 'fileutils'
require 'rexml/document'

# A giant sucking sound.
#
# Crawls a Maven 2 repository over HTTP, fetches the POM of each artifact
# found and writes an RDF file describing the artifact's licenses.
module Sucker

  DIR_EXCL = ['..', '.'].freeze

  # Representation of a repository artifact, modelled on Maven's
  # group / name (artifactId) / version coordinates.
  class Artifact
    attr_reader :group, :name, :version

    # group::   dotted group identifier, e.g. "org.apache.foo"
    # name::    artifact name
    # version:: version string
    def initialize(group, name, version)
      @group, @name, @version = group, name, version
    end

    # Names of the instance variables that make up this object's state
    # (presumably consumed by YAML serialization -- confirm against callers).
    def to_yaml_properties
      %w{ @group @name @version }
    end

    def ==(art) eql?(art); end
    def ===(art) eql?(art); end

    # Two artifacts are equal when group, name and version all match.
    # Anything that is not an Artifact is never equal (and no longer raises).
    def eql?(art)
      return false unless art.is_a?(Artifact)
      art.group == @group && art.name == @name && art.version == @version
    end

    # Hash consistent with #eql?, so artifacts can be used as Hash keys.
    def hash
      [@group, @name, @version].hash
    end

    # Repository-relative path of the artifact's jar, without a leading slash.
    def to_path
      "#{@group.gsub('.', '/')}/#{@name}/#{@version}/#{@name}-#{@version}.jar"
    end

    def to_s
      "#{name} v#{version} (project #{group})"
    end
  end

  # Enumerates on a Maven repository by scraping HTML. Yields on every
  # artifact found in there.
  class Maven2Repository
    EXCLUDE = ["sha1", "md5", "pom"].freeze

    # One sub-directory row of an index page; captures the directory name
    # from a relative href ending in "/". Absolute hrefs (such as a parent
    # directory link) contain extra slashes and do not match.
    # NOTE(review): assumes an Apache-autoindex style listing in which the
    # icon's "[DIR]" alt text precedes the anchor on the same line -- confirm
    # against the server being crawled.
    FOLDER_LINK = /\[DIR\].*?<a href="([^"\/]+)\/"/

    # One plain-file row of an index page (icon alt text "[   ]").
    FILE_LINK = /\[   \].*?<a href="[^"\/]*"/

    # Optional list of repository-relative URLs (each starting and ending
    # with "/") to restrict the crawl to; the whole repository when nil.
    attr_accessor :group_filters

    # server::     host name of the repository
    # port::       TCP port
    # url::        base path of the repository on the server, e.g. "/maven2"
    # proxy_info:: optional argument list for Net::HTTP::Proxy
    def initialize(server, port, url, proxy_info=nil)
      @server, @url, @port, @proxy_info = server, url, port, proxy_info
    end

    # Walks the repository depth-first and yields (artifact, http) for every
    # directory that has no sub-directories but lists at least one file.
    # +http+ is the connection used for the crawl so the caller can reuse it.
    # Returns an Enumerator when called without a block.
    def each
      return enum_for(:each) unless block_given?

      http = connection
      # Work on a copy so the crawl does not consume the caller's filter list.
      pending = (@group_filters || ['/']).dup
      until pending.empty?
        url = pending.shift
        body = http.get(@url + url).body.to_s
        folders = body.scan(FOLDER_LINK).flatten
        if folders.empty?
          next unless FILE_LINK =~ body
          # "/org/apache/foo/1.0/" => group "org.apache", name "foo", version "1.0".
          # Empty segments are dropped so the group carries no leading dot.
          segments = url.split('/').reject { |segment| segment.empty? }
          version = segments.pop
          name = segments.pop
          next unless name && version
          yield(Sucker::Artifact.new(segments.join('.'), name, version), http)
        else
          # DFS in inverse order (start with 'z'), change to << for BFS
          folders.each { |folder| pending.unshift("#{url}#{folder}/") }
        end
      end
    end

    private

    # Builds the HTTP client, going through the proxy when one was configured.
    def connection
      if @proxy_info
        Net::HTTP::Proxy(*@proxy_info).new(@server, @port)
      else
        Net::HTTP.new(@server, @port)
      end
    end
  end

  # Read-only view over the XML of a Maven POM.
  class MavenPom
    # Coordinates a project inherits from its <parent> when it omits them.
    INHERITED = [:groupId, :version].freeze

    # xml:: POM document as a String (or anything REXML::Document accepts)
    def initialize(xml)
      @doc = REXML::Document.new(xml)
    end

    # Array with one Hash per <license> element (child element name => text),
    # or nil when the POM declares no license.
    def licenses
      lics = []
      @doc.elements.each("//license") do |license_el|
        lics << children_to_hash(license_el.elements)
      end
      lics.empty? ? nil : lics
    end

    # Hash of the <parent> element's children (name => text), or nil when the
    # POM declares no parent.
    def parent
      parent = {}
      @doc.elements.each("//parent/*") do |child|
        text = child.text
        parent[child.name] = text.chomp if text
      end
      parent.empty? ? nil : parent
    end

    # Defines #artifactId, #groupId, #version and #name. Each reads the direct
    # child of <project>; a bare "//name" style lookup would return whichever
    # element comes first in the document, typically the one inside <parent>.
    # groupId and version fall back to the <parent> block when omitted.
    # Returns nil when the element is absent or empty.
    [:artifactId, :groupId, :version, :name].each do |meth|
      define_method(meth) do
        node = @doc.elements["/project/#{meth}"]
        node ||= @doc.elements["/project/parent/#{meth}"] if INHERITED.include?(meth)
        text = node && node.text
        text.chomp if text
      end
    end

    private

    # Collects element name => text, skipping elements without text.
    def children_to_hash(elements)
      hash = {}
      elements.each do |child|
        text = child.text
        hash[child.name] = text.chomp if text
      end
      hash
    end
  end

  # Writes the license information of a project as an RDF/XML file.
  class Serializer

    # Conversion from license name to its URL. Order matters: the first
    # matching pattern wins, so versioned patterns precede generic ones and
    # LGPL precedes GPL. The alternatives are grouped so that the version
    # suffix applies to every alternative, not only the last one.
    LICENSE = [
      [/Apache.*2/, "http://www.apache.org/licenses/LICENSE-2.0.txt"],
      [/Apache.*1\.1/, "http://www.apache.org/licenses/LICENSE-1.1"],
      [/Apache/, "http://www.apache.org/licenses/LICENSE-1.0"],
      [/BSD/, "http://www.opensource.org/licenses/bsd-license.php"],
      [/Public Domain/, "http://creativecommons.org/licenses/publicdomain/"],
      [/(LGPL|GNU (Library|Lesser)).*3/, "http://www.gnu.org/licenses/lgpl-3.0.txt"],
      [/LGPL|GNU (Library|Lesser)/, "http://opensource.org/licenses/lgpl-license.php"],
      [/(GPL|GNU (General )?Public License).*3/, "http://www.gnu.org/licenses/gpl-3.0.txt"],
      [/GPL|GNU (General )?Public License/, "http://opensource.org/licenses/gpl-license.php"],
      [/(MIT)|(Massachusetts)/, "http://opensource.org/licenses/mit-license.php"],
      [/.*/, "Unknown"]
    ].freeze

    # Instantiate with the pom of the project to serialize
    # information for and the one containing the licenses
    # (can be the same, nil, or a parent project pom).
    def initialize(pom, license_pom)
      @pom = pom; @license_pom = license_pom
    end

    # Builds the RDF document and writes it to
    # "<groupId>-<artifactId>-<version>.rdf" in the current directory.
    def write
      # Building the doc, no matter the language XML is verbose
      doc = REXML::Document.new
      rdf = doc.add_element("rdf:RDF")
      rdf.add_namespace("rdf", "http://www.w3.org/1999/02/22-rdf-syntax-ns#")
      rdf.add_namespace("doap", "http://usefulinc.com/ns/doap#")
      # Prefix must match the "dis:" elements emitted below.
      rdf.add_namespace("dis", "http://apache.org/ns/2007/discordia#")

      project = rdf.add_element("doap:Project")
      project.add_attribute("rdf:about", "mvn://#{@pom.groupId}/#{@pom.artifactId}/#{@pom.version}")
      project.add_element("doap:name").text = @pom.name if @pom.name

      # MavenPom#licenses returns nil when there is none; Array() turns that
      # into an empty list.
      Array((@license_pom || @pom).licenses).each do |lic_info|
        license = project.add_element("dis:has_license")
        off_url, url = sanitize(lic_info["name"], lic_info["url"])
        license.add_attribute("rdf:about", off_url)
        license.add_element("dis:source").text = url
      end

      # Saving it; the block form closes the file even if writing fails.
      File.open("#{@pom.groupId}-#{@pom.artifactId}-#{@pom.version}.rdf", "w") do |file|
        doc.write(file, 0)
      end
    end

    private

    # Making our best to reconcile the license info.
    # Returns [official_url, source_url]: the official URL is looked up from
    # the license name when there is one, and the source URL is the one the
    # POM declared, falling back to the official URL.
    def sanitize(name, url)
      return [url, url] unless name

      # Converting the name to a standard url; the catch-all last entry
      # guarantees a match.
      off_url = LICENSE.find { |pattern, _lic_url| pattern =~ name }[1]
      [off_url, url || off_url]
    end
  end
end

# Crawl only when run as a script, so the classes above can be loaded
# without touching the network.
if __FILE__ == $PROGRAM_NAME
  repo_path = "/maven2"
  # NOTE(review): plain HTTP on port 80 is kept from the original; confirm the
  # repository still serves directory listings that way.
  repo_sucker = Sucker::Maven2Repository.new("repo1.maven.org", 80, repo_path)

  # POM location for an artifact: same path as the jar, with a .pom extension.
  pom_path = lambda do |artifact|
    "#{repo_path}/#{artifact.to_path.sub(/\.jar\z/, '.pom')}"
  end

  repo_sucker.each do |artifact, http|
    response = http.get(pom_path.call(artifact))
    # No usable pom for this artifact.
    next unless response.code == "200"

    original_pom = Sucker::MavenPom.new(response.body)
    pom = original_pom
    while pom
      licenses = pom.licenses
      if licenses
        # Licenses are in the POM, all is well
        puts "Licenses for #{artifact} => #{licenses.inspect}"
        Sucker::Serializer.new(original_pom, pom).write
        break
      end

      # No license found, checking if this POM declares a (complete) parent
      parent = pom.parent
      break unless parent && %w[groupId artifactId version].all? { |key| parent[key] }

      # Checking if we can find a license in the parent
      parent_artifact = Sucker::Artifact.new(parent["groupId"], parent["artifactId"], parent["version"])
      parent_resp = http.get(pom_path.call(parent_artifact))
      pom = parent_resp.code == "200" ? Sucker::MavenPom.new(parent_resp.body) : nil
    end
  end
end