#!/usr/bin/perl
use strict;
use warnings;
use warnings FATAL => qw(uninitialized);

# Reports registered contributors of a MediaWiki stub-meta-history dump
# whose edit counts pass the thresholds applied at the bottom of this file.
#
# Usage:
#   wget http://download.wikimedia.org/jawiki/20070224/jawiki-20070224-stub-meta-history.xml.gz
#   gunzip -c jawiki-20070224-stub-meta-history.xml.gz | ./users
#
# The dump is read from the file named by the first argument, or from
# standard input when the argument is omitted or is '-'.  Progress is
# written to STDERR, the report to STDOUT.

# Per-user tallies, keyed by contributor ID.  Each value is an array ref:
#   [0] counted revisions made before the recent window
#   [1] counted revisions made inside the recent window
#   [2] highest revision ID seen so far for this user
#   [3] username recorded on the revision stored in [2]
our %user;

{
    # Handler package for XML::Parser's 'Stream' style (selected through
    # the Pkg option in main).
    package Parse;

    use strict;
    use warnings;
    use warnings FATAL => qw(uninitialized);

    # Namespace of the export schema; elements in any other namespace
    # will not match the names generated in StartDocument.
    my $mediawiki_ns = 'http://www.mediawiki.org/xml/export-0.3/';

    # Revisions at or after this timestamp are not counted at all.
    # ISO 8601 UTC timestamps of equal length sort correctly as strings.
    my $count_cutoff = '2007-01-11T09:40:00Z';

    # Counted revisions at or after this timestamp are "recent".
    my $recent_start = '2006-12-11T09:40:00Z';

    # A progress figure is printed every this many revisions.
    my $progress_interval = 20000;

    # Namespace-qualified element names, valid only during a parse.
    my $qname_mediawiki;
    my $qname_page;
    my $qname_revision;
    my $qname_id;
    my $qname_timestamp;
    my $qname_contributor;
    my $qname_username;

    # Hash ref collecting the fields of the <revision> currently being
    # parsed; undef while outside a revision.
    my $r_revision;
    my $revision_count;

    # Called once at the start of the document: resets parse state and
    # builds the qualified names the other handlers compare against.
    sub StartDocument {
        my ($expat) = @_;

        $r_revision     = undef;
        $revision_count = 0;

        $qname_mediawiki   = $expat->generate_ns_name('mediawiki',   $mediawiki_ns);
        $qname_page        = $expat->generate_ns_name('page',        $mediawiki_ns);
        $qname_revision    = $expat->generate_ns_name('revision',    $mediawiki_ns);
        $qname_id          = $expat->generate_ns_name('id',          $mediawiki_ns);
        $qname_timestamp   = $expat->generate_ns_name('timestamp',   $mediawiki_ns);
        $qname_contributor = $expat->generate_ns_name('contributor', $mediawiki_ns);
        $qname_username    = $expat->generate_ns_name('username',    $mediawiki_ns);

        print STDERR 'Reading file...';
    }

    # Starts a fresh revision record on <mediawiki>/<page>/<revision>.
    sub StartTag {
        my ($expat, $tag) = @_;

        my @context = $expat->context;
        if (   $expat->depth == 2
            && $expat->eq_name($context[0], $qname_mediawiki)
            && $expat->eq_name($context[1], $qname_page)
            && $expat->eq_name($tag, $qname_revision))
        {
            $r_revision = {};
        }
    }

    # Finishes the current revision: folds it into %user (registered
    # contributors only) and emits periodic progress.
    sub EndTag {
        my ($expat, $tag) = @_;

        # $r_revision is only set between a revision's start tag and the
        # first end tag seen at depth 2, which is that revision's own.
        if ($r_revision && $expat->depth == 2) {
            # Revisions without a contributor <id> are skipped but still
            # counted for progress.
            if (exists $r_revision->{'userid'}) {
                my $id        = $r_revision->{'id'};
                my $timestamp = $r_revision->{'timestamp'};
                my $username  = $r_revision->{'username'};
                my $userid    = $r_revision->{'userid'};

                my $r_user = $user{$userid};
                if (!defined($r_user)) {
                    $user{$userid} = $r_user = [0, 0, 0, undef];
                }

                if ($timestamp lt $count_cutoff) {
                    if ($timestamp lt $recent_start) {
                        $r_user->[0]++;
                    }
                    else {
                        $r_user->[1]++;
                    }
                }

                # Keep the username from the highest-numbered revision.
                # The revision ID must be stored too; otherwise every
                # revision compares against 0 and the last one parsed
                # wins regardless of its ID.
                if ($id > $r_user->[2]) {
                    $r_user->[2] = $id;
                    $r_user->[3] = $username;
                }
            }

            $r_revision = undef;
            $revision_count++;
            if ($revision_count % $progress_interval == 0) {
                print STDERR $revision_count, '...';
            }
        }
    }

    # Receives character data in $_ (Stream style) and stores the pieces
    # of the current revision that EndTag needs.
    sub Text {
        my ($expat) = @_;

        return if !$r_revision;

        # Copy $_ right away so later calls cannot clobber it.
        my $text = $_;

        my @context = $expat->context;
        my $depth   = $expat->depth;

        if ($depth == 4 && $expat->eq_name($context[3], $qname_id)) {
            # <revision>/<id>
            $r_revision->{'id'} = $text;
        }
        elsif ($depth == 4 && $expat->eq_name($context[3], $qname_timestamp)) {
            # <revision>/<timestamp>
            $r_revision->{'timestamp'} = $text;
        }
        elsif ($depth == 5
            && $expat->eq_name($context[3], $qname_contributor)
            && $expat->eq_name($context[4], $qname_username))
        {
            # <revision>/<contributor>/<username>
            $r_revision->{'username'} = $text;
        }
        elsif ($depth == 5
            && $expat->eq_name($context[3], $qname_contributor)
            && $expat->eq_name($context[4], $qname_id))
        {
            # <revision>/<contributor>/<id>
            $r_revision->{'userid'} = $text;
        }
    }

    # Deliberately empty.  NOTE(review): the Stream style appears to echo
    # items that have no handler, so this presumably exists to keep
    # processing instructions out of STDOUT -- confirm against the
    # XML::Parser documentation before removing.
    sub PI {
        my ($expat, $target, $data) = @_;

        # Do nothing
    }

    # Called once at the end of the document: prints the final count (if
    # it was not just printed) and drops the generated names.
    sub EndDocument {
        my ($expat) = @_;

        if ($revision_count % $progress_interval != 0) {
            print STDERR $revision_count, '...';
        }

        $qname_mediawiki   = undef;
        $qname_page        = undef;
        $qname_revision    = undef;
        $qname_id          = undef;
        $qname_timestamp   = undef;
        $qname_contributor = undef;
        $qname_username    = undef;

        print STDERR "done.\n";
    }
}

package main;

use XML::Parser;

my $dump_file = shift;
if (!defined($dump_file)) {
    $dump_file = '-';
}

binmode STDOUT, ':utf8';

my $parser = XML::Parser->new(
    Namespaces   => 1,
    Style        => 'Stream',
    Pkg          => 'Parse',
    ErrorContext => 5,
);

if ($dump_file eq '-') {
    # Parse STDIN explicitly instead of relying on parsefile() treating
    # '-' as standard input, which depends on how it opens the file.
    binmode STDIN;
    $parser->parse(\*STDIN);
}
else {
    $parser->parsefile($dump_file);
}

# Report users with at least one edit before the recent window, at least
# 50 counted edits overall, and at least 5 recent edits.  Sorted by user
# ID so that repeated runs produce identical output.
for my $userid (sort { $a <=> $b } keys %user) {
    my ($old_edits, $new_edits, undef, $username) = @{ $user{$userid} };
    my $total_edits = $old_edits + $new_edits;

    if ($old_edits >= 1 && $total_edits >= 50 && $new_edits >= 5) {
        print "$username (ID $userid; $total_edits edits, $new_edits recent)\n";
    }
}

# This code is in public domain.