use 5.008001;
use strict;
use warnings;
use utf8;
use re (qw/eval/);

package String::Copyright;

=encoding UTF-8

=head1 NAME

String::Copyright - Representation of text-based copyright statements

=head1 VERSION

Version 0.001004

=cut

our $VERSION = '0.001004';

# Dependencies
use Exporter 5.57 (qw/import/);
use Carp ();

our @EXPORT = qw/copyright/;

use constant {
	PLAINTEXT => 0,
	BLOCKS    => 1,
};

use overload (

#	q{@{}}   => sub { ${$_[0]->[BLOCKS]} },
#	q{@{}}   => sub { ( $_[0]->[BLOCKS] ) },
#	q{@{}}   => sub { $_[0]->[BLOCKS] },
#	q{0+}    => sub { 0+$_[0]->[BLOCKS] },
	q{""}    => sub { $_[0]->_compose },
	fallback => 1,
);

=head1 SYNOPSIS

    use String::Copyright;

    my $copyright = copyright(<<'END');
    copr. © 1999,2000 Foo Barbaz <fb@acme.corp> and Acme Corp.
    Copyright (c) 2001,2004 Foo (work address) <foo@zorg.corp>
    Copyright 2003, Foo B. and friends
    © 2000, 2002 Foo Barbaz <foo@bar.baz>
    END

    print $copyright;

    # Copyright 1999-2000 Foo Barbaz <fb@acme.com> and Acme Corp.
    # Copyright 2000, 2002 Foo Barbaz and Acme Corp.
    # Copyright 2001, 2004 Foo (work address) <foo@zorg.org>
    # Copyright 2003 Foo B. and friends

    print $copyright->normalize(
      alias  => {
        [ 'foo@bar.baz' => [ 'fb@acme.com', 'foo@zorg.org'] ] }
      mangle => {
        [ 's/Foo Barbaz\K(?= and .*)$/ <foo@bar.baz>/' ] }
    );

    # Copyright 1999-2000, 2002-2003 Acme Corp.
    # Copyright 1999-2004 Foo Barbaz <foo@bar.baz>
    # Copyright 2003 Foo B. and friends

=head1 DESCRIPTION

L<String::Copyright> Parses common styles of copyright statements
and serializes in normalized format.

=cut

my $hspace_re = qr/[ \t]/;
my $sign_re
	= qr/copyright(?:-holders?)?|copr\.|[©⒞Ⓒⓒ🄒🄫🅒]|\(c\)/i;
my $bogus_sign_re = qr/^[#]define$hspace_re(?:\S|$hspace_re)*\(c\)/i;

my $year_re  = qr/\b[0-9]{4}\b/;
my $comma_re = qr/[,\s]*/;
my $dash_re  = qr/$hspace_re*[-˗‐‑‒–—―⁃−﹣－]$hspace_re*/;

# TODO: test if \K or non-backref beneficial on perl >= 5.10
#my $comma_tidy_re = qr/$year_re\K$comma_re(?=$year_re)/;
#my $comma_tidy_re = qr/($year_re)$comma_re(?=$year_re)/;
my $comma_tidy_re = qr/(?<=$year_re)$comma_re(?=$year_re)/;

# TODO: test if \K beneficial on perl >= 5.10
#my $year_begin_re = qr/(?:\A|(?<!-))($year_re)\K/;
our $y;
my $y_save_re   = qr/(?:(?<=[^-]($year_re))|(?<=\A($year_re)))(?{$y=$^N})/;
my $y_next_re   = qr/(??{++$y})/;
my $y_future_re = qr/(??{if($y<=$^N){$y=$^N;'';}else{'XXXX';}})/;
my $years_tidy_re
	= qr/$y_save_re(?:(?:$dash_re|$comma_re)($y_next_re)\b|-($year_re)$y_future_re\b)+/;

my $bogus_owner_re
	= qr/\b(?:info(?:rmation)?(?!@)|(?:notice|statement|claim|string|holder|owner)s?|is|in|to|ownership)\b/i;

my $years_re = qr/$year_re(?:(?:$dash_re|$comma_re)$year_re)*/;
my $owners_re
	= qr/((?:\bby\b\s*)?((?:(?!$years_re|$bogus_owner_re)\S(?:(?!$bogus_owner_re)\S|$hspace_re)*?)?))\s*$/;

my $years_owners_re = qr/\s*((?:$years_re)?)$comma_re$owners_re/;
my $sign_years_owners_re
	= qr/^(?:(?!$sign_re|$bogus_sign_re)\S|$hspace_re)*?$sign_re(?:$hspace_re+$sign_re)*(?::|$hspace_re)$years_owners_re/;

sub copyright
{
	my $copyright = shift;
	Carp::croak("String::Copyright strings require defined parts")
		unless 1 + @_ == grep {defined} $copyright, @_;

	# String::Copyright objects are effectively immutable and can be reused
	if ( !@_ && ref($copyright) eq __PACKAGE__ ) {
		return $copyright;
	}

	# stringify objects
	$copyright = "$copyright";

	# TODO: also parse @_ - but each separately!
	my $blocks = parse_string( split /^/, $copyright );

	bless [ $copyright, $blocks ], __PACKAGE__;
}

sub new
{
	my ( $self, @data ) = @_;
	Carp::croak("String::Copyright require defined, positive-length parts")
		unless 1 + @_ == grep { defined && length } @data;

	# String::Copyright objects are simply stripped of their string part
	if ( !@_ && ref($self) eq __PACKAGE__ ) {
		return bless [ undef, $data[1] ], __PACKAGE__;
	}

	# FIXME: properly validate data
	Carp::croak("String::Copyright blocks must be an array of strings")
		unless @_ == grep { ref eq 'ARRAY' } @data;

	bless [ undef, \@data ], __PACKAGE__;
}

sub blocks { $_[0]->[BLOCKS] }

sub normalize
{
	my ( $self, @opts ) = @_;
	Carp::confess("normalize options not yet implemented")
		if @opts;

	new($self);
}

sub _compose
{
	join "\n", map { '© ' . $_ } @{ $_[0]->[BLOCKS] };
}

sub is_normalized { !defined $_[0]->[PLAINTEXT] }

sub parse_string
{
	my @block;
	my $skipped_lines = 0;

	while (@_) {
		my ( $years, $owners_dirty, $owners )
			= shift =~ /$sign_years_owners_re/;

		if ( not( length $years or length $owners ) ) {

 # skip remaining lines if a copyright blocks was found more than 5 lines ago.
 # so a copyright block may contain up to 5 blank lines, but no more
			last if ( @block and $skipped_lines++ > 5 );
			next;
		}

		# extend block to next line(s) if only years found so far
		while ( @_ and length $years and not length $owners_dirty ) {
			( my $newyears, $owners_dirty, $owners )
				= shift =~ /^$years_owners_re/;
			$years .= ' ' . $newyears if length $newyears;
		}

		# normalize
		$years =~ s/$comma_tidy_re/, /g;
		$years =~ s/$years_tidy_re/-$^N/g;
		$owners =~ s/\s{2,}/ /g;

		my $copyright_match
			= ( length $years and length $owners )
			? $years . ' ' . $owners
			: $years . $owners;

# split owner into owner_id and owner

		push @block, $copyright_match;
	}

# TODO: save $skipped_lines to indicate how dirty parsing was

	return \@block;
}

=head1 SEE ALSO

=over 4

=item *

L<Encode>

=back

=head1 BUGS/CAVEATS/etc

L<String::Copyright> operates on strings, not bytes.
Data encoded as UTF-8, Latin1 or other formats
need to be decoded to strings before use.

Only ASCII characters and B<©> (copyright sign) are directly processed.

If copyright sign is mis-detected
or accents or multi-byte characters display wrong,
then most likely the data was not decoded into a string.

If ranges or lists of years are not tidied,
then maybe it contained non-ASCII whitespace or digits.

=head1 AUTHOR

Jonas Smedegaard C<< <dr@jones.dk> >>

=head1 COPYRIGHT AND LICENSE

Derived from L<App::Licensecheck> originally part of the KDE SDK,
originally introduced by Stefan Westerfeld C<< <stefan@space.twc.de> >>;
and on the script licensecheck2dep5 part of Debian CDBS tool,
written by Jonas Smedegaard.

  Copyright © 2007, 2008 Adam D. Barratt

  Copyright © 2005-2012, 2016 Jonas Smedegaard

This program is free software; you can redistribute it and/or modify it
under the terms of the GNU General Public License as published by the
Free Software Foundation; either version 3, or (at your option) any
later version.

This program is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
General Public License for more details.

You should have received a copy of the GNU General Public License along
with this program. If not, see <https://www.gnu.org/licenses/>.

=cut

1;
