#!/usr/bin/perl

package App::Licensecheck;

use v5.14;
use utf8;
use strictures 2;
use warnings qw(FATAL utf8);
use charnames qw(:full :short);
no warnings "nonchar";
no warnings "surrogate";
no warnings "non_unicode";
use autodie;

use version;
use Path::Tiny;
use Fcntl qw/:seek/;
use Encode;
use Types::Standard qw< Bool Str Maybe InstanceOf >;
use Types::Common::Numeric qw< PositiveOrZeroInt >;
use Type::Utils qw< declare as via coerce from >;

use Moo;
use namespace::clean;

=head1 NAME

App::Licensecheck - functions for a simple license checker for source files

=head1 VERSION

Version v3.0.0

=cut

our $VERSION = version->declare("v3.0.0");

=head1 SYNOPSIS

    use App::Licensecheck;

    my $app = App::Licensecheck->new;

    $app->lines(0); # Speedup parsing - our file is not huge

    printf "License: %s\nCopyright: %s\n", $app->parse( 'some-file' );

=head1 DESCRIPTION

L<App::Licensecheck> is the core of C<licensecheck> script
to check for licenses of source files.
See the script for casual usage.

=cut

# also used to cleanup
my $copyright_indicator_regex = qr!
	(?:copyright(?:-holders?)? # The full word (or slightly more)
		|copr\. # Legally-valid abbreviation
		|\xc2\xa9 # Unicode copyright sign encoded in iso8859
		|\x{00a9} # Unicode character COPYRIGHT SIGN
		#|© # Unicode character COPYRIGHT SIGN
		|\(c\) # Legally-null representation of sign
	)
!lix;

my $copyright_indicator_regex_with_capture
	= qr!$copyright_indicator_regex(?::\s*|\s+)(\S.*)$!lix;

# avoid ditching things like <info@foo.com>
my $copyright_disindicator_regex = qr{
	\b(?:info(?:rmation)?(?!@) # Discussing copyright information
	|(notice|statement|claim|string)s? # Discussing the notice
	|is|in|to # Part of a sentence
	|(holder|owner)s? # Part of a sentence
	|ownership # Part of a sentence
	)\b
}ix;

my $copyright_predisindicator_regex = qr!(
	^[#]define\s+.*\(c\) # #define foo(c) -- not copyright
)!ix;

my $EncodingRef = declare as InstanceOf [qw(Encode::Encoding Encode::XS)];
coerce $EncodingRef, from Str, via { find_encoding($_) };

has lines => (
	is      => 'rw',
	isa     => PositiveOrZeroInt,
	default => sub {60},
);

has tail => (
	is      => 'rw',
	isa     => PositiveOrZeroInt,
	default => sub {5000},          # roughly 60 lines of 80 chars
);

has encoding => (
	is     => 'rw',
	isa    => Maybe [$EncodingRef],
	coerce => $EncodingRef->coercion,
);

has verbose => (
	is  => 'rw',
	isa => Bool,
);

has deb_fmt => (
	is  => 'rw',
	isa => Bool,
);

sub parse
{
	my $self = shift;
	my $file = path(shift);

	if ( $self->lines == 0 ) {
		return ( $self->parse_file($file) );
	}
	else {
		return ( $self->parse_lines($file) );
	}
}

sub parse_file
{
	my $self = shift;
	my $file = path(shift);

	my $content;

	if ( not $self->encoding ) {
		$content = $file->slurp_raw;
	}
	elsif ( $self->encoding->name eq 'utf8' ) {
		$content = $file->slurp_utf8;
	}
	else {
		$content = $file->slurp(
			{ binmode => sprintf ':encoding(%s)', $self->encoding->name } );
	}

	my %copyrights = extract_copyright($content);

	print qq(----- $file content -----\n$content----- end content -----\n\n)
		if $self->verbose;

	my $license
		= $self->parse_license(
		clean_cruft_and_spaces( clean_comments($content) ) );
	my $copyright = join( " / ", reverse sort values %copyrights );

	return ( $license, $copyright );
}

sub parse_lines
{
	my $self      = shift;
	my $file      = path(shift);
	my $content   = '';
	my $copyright = '';

	my $fh;
	my $st = $file->stat;

	if ( not $self->encoding ) {
		$fh = $file->openr_raw;
	}
	elsif ( $self->encoding->name eq 'utf8' ) {
		$fh = $file->openr_utf8;
	}
	else {
		$fh = $file->openr( sprintf ':encoding(%s)', $self->encoding->name );
	}

	while ( my $line = $fh->getline ) {
		last if ( $fh->input_line_number > $self->lines );
		$content .= $line;
	}

	my %copyrights = extract_copyright($content);

	print qq(----- $file header -----\n$content----- end header -----\n\n)
		if $self->verbose;

	my $license
		= $self->parse_license(
		clean_cruft_and_spaces( clean_comments($content) ) );
	$copyright = join( " / ", reverse sort values %copyrights );

	if ( not $copyright and $license eq 'UNKNOWN' ) {
		my $position = $fh->tell;                 # See IO::Seekable
		my $jump     = $st->size - $self->tail;
		$jump = $position if $jump < $position;

		my $tail = '';
		if ( $self->tail and $jump < $st->size ) {
			$fh->seek( $jump, SEEK_SET );         # also IO::Seekable
			$tail .= join( '', $fh->getlines );
		}

		print qq(----- $file tail -----\n$tail----- end tail -----\n\n)
			if $self->verbose;

		%copyrights = extract_copyright($tail);
		$license
			= $self->parse_license(
			clean_cruft_and_spaces( clean_comments($tail) ) );
		$copyright = join( " / ", reverse sort values %copyrights );
	}

	$fh->close;
	return ( $license, $copyright );
}

sub extract_copyright
{
	my $content = shift;
	my @c = split /\n/, clean_comments($content);

	my %copyrights;
	my $lines_after_copyright_block = 0;

	my $in_copyright_block = 0;
	while (@c) {
		my $line = shift @c;
		my $copyright_match = parse_copyright( $line, \$in_copyright_block );
		if ($copyright_match) {
			while ( @c and $copyright_match =~ /\d[,.]?\s*$/ ) {

   # looks like copyright end with a year, assume the owner is on next line(s)
				$copyright_match .= ' ' . shift @c;
			}
			$copyright_match =~ s/\s+/ /g;
			$copyright_match =~ s/\s*$//;
			$copyrights{ lc("$copyright_match") } = "$copyright_match";
		}
		elsif ( scalar keys %copyrights ) {

 # skip remaining lines if a copyright blocks was found more than 5 lines ago.
 # so a copyright block may contain up to 5 blank lines, but no more
			last if $lines_after_copyright_block++ > 5;
		}
	}
	return %copyrights;
}

sub parse_copyright
{
	my $data                   = shift;
	my $in_copyright_block_ref = shift;
	my $copyright              = '';
	my $match;

	if ( $data !~ $copyright_predisindicator_regex ) {

		#print "match against ->$data<-\n";
		if ( $data =~ $copyright_indicator_regex_with_capture ) {
			$match                   = $1;
			$$in_copyright_block_ref = 1;

			# Ignore lines matching "see foo for copyright information" etc.
			if ( $match !~ $copyright_disindicator_regex ) {

				# De-cruft
				$match =~ s/$copyright_indicator_regex//igx;
				$match =~ s/^\s+//;
				$match =~ s/\s*\bby\b\s*/ /;
				$match =~ s/([,.])?\s*$//;
				$match =~ s/\s{2,}/ /g;
				$match =~ s/\\//g;    # de-cruft nroff files
				$match =~ s/\s*[*#]\s*$//;
				$copyright = $match;
			}
		}
		elsif ( $$in_copyright_block_ref and $data =~ /^\d{2,}[,\s]+/ ) {

			# following lines beginning with a year are supposed to be
			# continued copyright blocks
			$copyright = $data;
		}
		else {
			$$in_copyright_block_ref = 0;
		}
	}
	return $copyright;
}

sub clean_comments
{
	local $_ = shift or return q{};

	# Remove generic comments: look for 4 or more lines beginning with
	# regular comment pattern and trim it. Fall back to old algorithm
	# if no such pattern found.
	my @matches = m/^\s*((?:[^a-zA-Z0-9\s]{1,3}|\bREM\b))\s\w/mg;
	if ( @matches >= 4 ) {
		my $comment_re = qr/\s*[\Q$matches[0]\E]{1,3}\s*/;
		s/^$comment_re//mg;
	}

	# Remove Fortran comments
	s/^[cC] //gm;

	# Remove C / C++ comments
	s#(\*/|/[/*])##g;

	return $_;
}

sub clean_cruft_and_spaces
{
	local $_ = shift or return q{};

	tr/\t\r\n/ /;

	# this also removes quotes
	tr% A-Za-z.,@;0-9\(\)/-%%cd;
	tr/ //s;

	return $_;
}

sub parse_license
{
	my $self = shift;
	my ($licensetext) = @_;

	my $gplver    = "";
	my $extrainfo = "";
	my $license   = "";
	my @spdx_gplver;

  # @spdx_license contains identifiers from https://spdx.org/licenses/
  # it would be more efficient to store license info only in this
  # array and then convert it to legacy formulation, but there are
  # corner case (like extrainfo) that would not fit. So the old storage scheme
  # is kept with the new (spdx/dep-5) scheme to keep backward compat.
	my @spdx_license;
	my $spdx_extra;
	my $gen_spdx = sub {
		my @ret
			= @spdx_gplver ? ( map { "$_[0]-$_"; } @spdx_gplver ) : ( $_[0] );
		push @ret, $spdx_extra if $spdx_extra;
		return @ret;
	};

	if ( $licensetext
		=~ /version ([^ ]+)(?: of the License)?,? or(?: \(at your option\))? version (\d(?:[.-]\d+)*)/
		)
	{
		$gplver = " (v$1 or v$2)";
		@spdx_gplver = ( $1, $2 );
	}
	elsif ( $licensetext
		=~ /version ([^, ]+?)[.,]? (?:\(?only\)?.? )?(?:of the GNU (Affero )?(Lesser |Library )?General Public License )?(as )?published by the Free Software Foundation/i
		or $licensetext
		=~ /GNU (?:Affero )?(?:Lesser |Library )?General Public License (?:as )?published by the Free Software Foundation[;,] version ([^, ]+?)[.,]? /i
		)
	{
		$gplver      = " (v$1)";
		@spdx_gplver = ($1);
	}
	elsif ( $licensetext
		=~ /GNU (?:Affero )?(?:Lesser |Library )?General Public License\s*(?:[(),GPL]+)\s*version (\d+(?:\.\d+)?)[ \.]/i
		)
	{
		$gplver      = " (v$1)";
		@spdx_gplver = ($1);
	}
	elsif ( $licensetext
		=~ /either version ([^ ]+)(?: of the License)?, or (?:\(at your option\) )?any later version/
		)
	{
		$gplver      = " (v$1 or later)";
		@spdx_gplver = ( $1 . '+' );
	}
	elsif ( $licensetext
		=~ /GPL\sas\spublished\sby\sthe\sFree\sSoftware\sFoundation,\sversion\s([\d.]+)/i
		)
	{
		$gplver      = " (v$1)";
		@spdx_gplver = ($1);
	}
	if ( $licensetext
		=~ /(?:675 Mass Ave|59 Temple Place|51 Franklin Steet|02139|02111-1307)/i
		)
	{
		$extrainfo = " (with incorrect FSF address)$extrainfo";
	}
	if ( $licensetext
		=~ /permission (?:is (also granted|given))? to link (the code of )?this program with (any edition of )?(Qt|the Qt library)/i
		)
	{
		$extrainfo  = " (with Qt exception)$extrainfo";
		$spdx_extra = 'with Qt exception';
	}

	# exclude blurb found in boost license text
	if ( $licensetext
		=~ /(All changes made in this file will be lost|DO NOT ((?:HAND )?EDIT|delete this file|modify)|edit the original|Generated (automatically|by|from|data)|generated.*file)/i
		and $licensetext
		!~ /unless such copies or derivative works are solely in the form of machine-executable object code generated by a source language processor/
		)
	{
		$license = "GENERATED FILE";
		push @spdx_license, 'generated-file';
	}
	if ( $licensetext
		=~ /(are made available|(is free software.? )?you can redistribute (it|them) and(?:\/|\s+)or modify (it|them)|is licensed) under the terms of (version [^ ]+ of )?the (GNU (Library |Lesser )General Public License|LGPL)/i
		)
	{
		$license = "LGPL$gplver$extrainfo $license";
		push @spdx_license, $gen_spdx->('LGPL');
	}

	# For Perl modules handled by Dist::Zilla
	elsif ( $licensetext
		=~ /this is free software,? licensed under:? (?:the )?(?:GNU (?:Library |Lesser )General Public License|LGPL),? version ([\d\.]+)/i
		)
	{
		$license = "LGPL (v$1) $license";
		push @spdx_license, "LGPL-$1";
	}
	if ( $licensetext
		=~ /is free software.? you can redistribute (it|them) and(?:\/|\s+)or modify (it|them) under the terms of the (GNU Affero General Public License|AGPL)/i
		)
	{
		$license = "AGPL$gplver$extrainfo $license";
		push @spdx_license, $gen_spdx->('AGPL');
	}
	if ( $licensetext
		=~ /(is free software.? )?you (can|may) redistribute (it|them) and(?:\/|\s+)or modify (it|them) under the terms of (?:version [^ ]+ (?:\(?only\)? )?of )?the GNU General Public License/i
		)
	{
		$license = "GPL$gplver$extrainfo $license";
		push @spdx_license, $gen_spdx->('GPL');
	}
	if ( $licensetext
		=~ /is distributed under the terms of the GNU General Public License,/
		and length $gplver )
	{
		$license = "GPL$gplver$extrainfo $license";
		push @spdx_license, $gen_spdx->('GPL');
	}
	if ( $licensetext
		=~ /(?:is|may be)\s(?:(?:distributed|used).*?terms|being\s+released).*?\b(L?GPL)\b/
		)
	{
		my $v = $gplver || ' (unversioned/unknown version)';
		$license = "$1$v $license";
		push @spdx_license, $gen_spdx->($1);
	}
	if ( $licensetext
		=~ /the rights to distribute and use this software as governed by the terms of the Lisp Lesser General Public License|\bLLGPL\b/
		)
	{
		$license = "LLGPL $license";
		push @spdx_license, 'LLGPL';
	}
	if ( $licensetext
		=~ /This file is part of the .*Qt GUI Toolkit. This file may be distributed under the terms of the Q Public License as defined/
		)
	{
		$license = "QPL (part of Qt) $license";
	}
	elsif ( $licensetext
		=~ /may (be distributed|redistribute it) under the terms of the Q Public License/
		)
	{
		$license = "QPL $license";
		push @spdx_license, 'QPL';
	}
	if ( $licensetext =~ /opensource\.org\/licenses\/mit-license\.php/ ) {
		$license = "MIT/X11 (BSD like) $license";
		push @spdx_license, 'Expat';
	}
	elsif ( $licensetext
		=~ /Permission is hereby granted, free of charge, to any person obtaining a copy of this software and(\/or)? associated documentation files \(the (Software|Materials)\), to deal in the (Software|Materials)/
		)
	{
		$license = "MIT/X11 (BSD like) $license";
		push @spdx_license, 'Expat';
	}
	elsif ( $licensetext
		=~ /Permission is hereby granted, without written agreement and without license or royalty fees, to use, copy, modify, and distribute this software and its documentation for any purpose/
		)
	{
		$license = "MIT/X11 (BSD like) $license";
		push @spdx_license, 'Expat';
	}
	if ( $licensetext
		=~ /Permission to use, copy, modify, and(\/or)? distribute this software for any purpose with or without fee is hereby granted, provided.*copyright notice.*permission notice.*all copies/
		)
	{
		$license = "ISC $license";
		push @spdx_license, 'ISC';
	}
	if ( $licensetext
		=~ /THIS SOFTWARE IS PROVIDED .*AS IS AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY/
		)
	{
		if ( $licensetext
			=~ /All advertising materials mentioning features or use of this software must display the following acknowledge?ment.*This product includes software developed by/i
			)
		{
			$license = "BSD (4 clause) $license";
			push @spdx_license, 'BSD-4-Clause';
		}
		elsif ( $licensetext
			=~ /(The name(?:\(s\))? .*? may not|Neither the (names? .*?|authors?) nor the names of( (its|their|other|any))? contributors may) be used to endorse or promote products derived from this software/i
			)
		{
			$license = "BSD (3 clause) $license";
			push @spdx_license, 'BSD-3-Clause';
		}
		elsif ( $licensetext
			=~ /Redistributions in binary form must reproduce the above copyright notice/i
			)
		{
			$license = "BSD (2 clause) $license";
			push @spdx_license, 'BSD-2-Clause';
		}
		else {
			$license = "BSD $license";
			push @spdx_license, 'BSD';
		}
	}
	elsif ( $licensetext =~ /licen[sc]ebsd(?:-(\d)-clause)?/i ) {
		if ($1) {
			$license = "BSD ($1 clause) $license";
			push @spdx_license, "BSD-$1-Clause";
		}
		else {
			$license = "BSD $license";
			push @spdx_license, "BSD";
		}
	}
	if ( $licensetext
		=~ /Mozilla Public License,? (?:(?:Version|v\.)\s+)?(\d+(?:\.\d+)?)/ )
	{
		$license = "MPL (v$1) $license";
		push @spdx_license, "MPL-$1";
	}
	elsif ( $licensetext
		=~ /Mozilla Public License,? \((?:Version|v\.) (\d+(?:\.\d+)?)\)/ )
	{
		$license = "MPL (v$1) $license";
		push @spdx_license, "MPL-$1";
	}

# match when either:
# - the text *begins* with "The Artistic license v2.0" which is (hopefully) the actual artistic license v2.0 text.
# - a license grant is found. i.e something like "this is free software, licensed under the artistic license v2.0"
	if ( $licensetext
		=~ /(?:^\s*|(?:This is free software, licensed|Released|be used|use and modify this (?:module|software)) under (?:the terms of )?)[Tt]he Artistic License ([v\d.]*\d)/
		)
	{
		$license = "Artistic (v$1) $license";
		push @spdx_license, "Artistic-$1";
	}
	if ( $licensetext =~ /is free software under the Artistic [Ll]icense/ ) {
		$license = "Artistic $license";
		push @spdx_license, 'Artistic';
	}
	if ( $licensetext
		=~ /This program is free software; you can redistribute it and\/or modify it under the same terms as Perl itself/
		)
	{
		$license = "Perl $license";
		push @spdx_license, 'Artistic', 'GPL';
	}
	if ( $licensetext =~ /under the Apache License, Version ([^ ]+)/ ) {
		$license = "Apache (v$1) $license";
		push @spdx_license, "Apache-$1";
	}
	if ( $licensetext =~ /(THE BEER-WARE LICENSE)/i ) {
		$license = "Beerware $license";
		push @spdx_license, 'Beerware';
	}
	if ( $licensetext
		=~ /distributed under the terms of the FreeType project/i )
	{
		$license = "FreeType $license";
		push @spdx_license, 'FTL';
	}
	if ( $licensetext
		=~ /This source file is subject to version ([^ ]+) of the PHP license/
		)
	{
		$license = "PHP (v$1) $license";
		push @spdx_license, "PHP-$1";
	}
	if ( $licensetext =~ /under the terms of the CeCILL-([^ ]+) / ) {
		$license = "CeCILL-$1 $license";
		push @spdx_license, "CECILL-$1";
	}
	elsif ( $licensetext =~ /under the terms of the CeCILL / ) {
		$license = "CeCILL $license";
		push @spdx_license, 'CECILL';
	}
	if ( $licensetext =~ /under the SGI Free Software License B/ ) {
		$license = "SGI Free Software License B $license";
		push @spdx_license, 'SGI-B';
	}
	if ( $licensetext =~ /is in the public domain/i ) {
		$license = "Public domain $license";
		push @spdx_license, 'public-domain';    # not listed by SPDX
	}
	if ( $licensetext
		=~ /terms of the Common Development and Distribution License(, Version ([^(]+))? \(the License\)/
		)
	{
		$license = "CDDL " . ( $1 ? "(v$2) " : '' ) . $license;
		push @spdx_license, 'CDDL' . ( $1 ? "-$2" : '' );
	}
	if ( $licensetext =~ /Microsoft Permissive License \(Ms-PL\)/ ) {
		$license = "Ms-PL $license";
		push @spdx_license, 'MS-PL';
	}
	if ( $licensetext
		=~ /Licensed under the Academic Free License version ([\d.]+)/ )
	{
		$license = $1 ? "AFL-$1" : "AFL";
		push @spdx_license, 'AFL' . ( $1 ? "-$1" : '' );
	}
	if ( $licensetext
		=~ /This program and the accompanying materials are made available under the terms of the Eclipse Public License v?([\d.]+)/
		)
	{
		$license = $1 ? "EPL-$1" : "EPL";
		push @spdx_license, 'EPL' . ( $1 ? "-$1" : '' );
	}

	# quotes were removed by clean_comments function
	if ( $licensetext
		=~ /Permission is hereby granted, free of charge, to any person or organization obtaining a copy of the software and accompanying documentation covered by this license \(the Software\)/
		or $licensetext
		=~ /Boost Software License([ ,-]+Version ([^ ]+)?(\.))/i )
	{
		$license = "BSL " . ( $1 ? "(v$2) " : '' ) . $license;
		push @spdx_license, 'BSL' . ( $1 ? "-$2" : '' );
	}
	if ( $licensetext
		=~ /PYTHON SOFTWARE FOUNDATION LICENSE (VERSION ([^ ]+))/i )
	{
		$license = "PSF " . ( $1 ? "(v$2) " : '' ) . $license;
		push @spdx_license, 'Python' . ( $1 ? "-$2" : '' );
	}
	if ( $licensetext
		=~ /The origin of this software must not be misrepresented.*Altered source versions must be plainly marked as such.*This notice may not be removed or altered from any source distribution/
		or $licensetext =~ /see copyright notice in zlib\.h/ )
	{
		$license = "zlib/libpng $license";
		push @spdx_license, 'Zlib';
	}
	elsif ( $licensetext =~ /This code is released under the libpng license/ )
	{
		$license = "libpng $license";
		push @spdx_license, 'Libpng';
	}
	if ( $licensetext
		=~ /Do What The Fuck You Want To Public License, Version ([^, ]+)/i )
	{
		$license = "WTFPL (v$1) $license";
		push @spdx_license, "WTFPL-$1";
	}
	if ( $licensetext =~ /Do what The Fuck You Want To Public License/i ) {
		$license = "WTFPL $license";
		push @spdx_license, "WTFPL";
	}
	if ( $licensetext =~ /(License WTFPL|Under (the|a) WTFPL)/i ) {
		$license = "WTFPL $license";
		push @spdx_license, "WTFPL";
	}
	$license = "UNKNOWN" if ( !length($license) );
	push @spdx_license, "UNKNOWN" unless @spdx_license;

	# Remove trailing spaces.
	$license =~ s/\s+$//;
	return $self->deb_fmt ? join( ' or ', @spdx_license ) : $license;
}

=encoding UTF-8

=head1 AUTHOR

Adam D. Barratt C<< <adam@adam-barratt.org.uk> >>

=head1 COPYRIGHT AND LICENSE

This program was originally based on the script "licensecheck" from the
KDE SDK (by C<< <dfaure@kde.org> >>).

This version is
  Copyright © 2007, 2008 Adam D. Barratt
  Copyright © 2012 Francesco Poli

This program is free software; you can redistribute it and/or modify it
under the terms of the GNU General Public License as published by the
Free Software Foundation; either version 2 of the License, or (at your
option) any later version.

This program is distributed in the hope that it will be useful, but
 WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
General Public License for more details.

You should have received a copy of the GNU General Public License along
with this program. If not, see <https://www.gnu.org/licenses/>.

=cut

1;
