User:Philosobot/Source code/phillists/update philosophy.pl

Source: Wikipedia, the free encyclopedia.
#!/usr/bin/perl
use strict;		      # 'strict' insists that all variables be declared
use diagnostics;	      # 'diagnostics' expands the cryptic warnings
use open 'utf8';

use lib $ENV{HOME} . '/public_html/wp/modules'; # path to perl modules
require 'bin/perlwikipedia_utils.pl'; # my own packages, this and the one below
require 'bin/fetch_articles.pl';
require 'bin/rm_extra_html.pl';
require 'strip_accents_and_stuff.pl';
require 'lists_utils.pl';
undef $/; # undefines the separator. Can read one whole file in one scalar.

# Collect the philosophy articles from the philosophy categories. Merge them into the [[Index of philosophy]] on Wikipedia.
# Remove redlinks, redirects, and disambig pages. Submit to Wikipedia the log of changes and newly detected categories. This runs daily.

# Entry point. Collects today's articles from the philosophy categories, merges
# them into every "Index of philosophy (<letter>)" page, posts the newly
# detected categories and finally publishes the log of changes.
MAIN: {

    $| = 1; # flush the output buffer after every write

    # One index page for titles starting with a digit, then one per letter.
    my @letters = ("0-9", "A" .. "Z");

    # Files involved (they are many).
    my $list_of_categories = 'List_of_philosophy_categories.wiki';
    my $log_file           = "User:Philosobot/Changes_to_phillists.wiki";

    # The files with the .txt extension are local, they don't get submitted to Wikipedia.
    my $articles_from_cats_file = 'All_philosophy_from_cats.txt';
    my $all_phil_arts_file      = 'All_philosophy.txt';
    my $philosophers_logfile    = 'Philosophers_log.txt';

    my $prefix = "Index of philosophy";

    # Passed to every fetch from / submit to Wikipedia below.
    my $sleep    = 5;
    my $attempts = 500;
    my $Editor   = wikipedia_login();

    # Get today's articles found in categories.
    my (@philosophy_categories, @philosopher_categories, @other_categories);
    read_categories_from_list(\@philosophy_categories, \@philosopher_categories,
                              \@other_categories, $list_of_categories);

    my (@articles_from_cats, @new_categories);
    fetch_articles(\@philosophy_categories, \@articles_from_cats, \@new_categories);

    # Randomized to later identify entries differing only by capitals.
    @articles_from_cats = randomize_array(@articles_from_cats);

    # Articles which we will not allow in the philosophy list for various reasons.
    my %blacklist;
    put_redlinks_on_blacklist($prefix, \@letters, \%blacklist);
    put_philosophers_on_blacklist_and_user_selected_also(\%blacklist);
    put_redirects_on_blacklist(\%blacklist, $articles_from_cats_file, \@articles_from_cats);

    # Go letter by letter, and merge the new entries.
    my %all_articles; # filled by merge_new_entries_from_categories with every listed article
    foreach my $letter (@letters) {
        my $file = "$prefix ($letter).wiki";

        # Fetch the current list from Wikipedia. Quit if any of the lists
        # cannot be obtained, but say so instead of stopping silently.
        my $text = wikipedia_fetch($Editor, $file, $attempts, $sleep);
        if (!defined $text || $text =~ /^\s*$/) {
            warn "Could not fetch '$file' from Wikipedia; stopping the update.\n";
            # NOTE(review): the exit status is kept at 0 as before; consider a
            # non-zero status once it is known nothing depends on it.
            exit(0);
        }

        # The heart of the code.
        $text = merge_new_entries_from_categories($letter, $text, \@articles_from_cats,
                                                  \%blacklist, \%all_articles);

        my $edit_summary = "Daily update. See the log at [[User:Philosobot/Changes to phillists]].";
        wikipedia_submit($Editor, $file, $edit_summary, $text, $attempts, $sleep);
    }

    post_newly_detected_categories(\@philosophy_categories, \@philosopher_categories,
                                   \@other_categories, \@new_categories);

    # Create the log of changes to the philosophy articles.
    my $todays_log = process_log_of_todays_changes(\%all_articles, \%blacklist, $all_phil_arts_file);

    # Read the log of changes to the philosopher articles. A missing or
    # unreadable file is reported and treated as an empty log.
    my $philosophers_log = '';
    if (open(my $log_fh, '<', $philosophers_logfile)) {
        local $/; # slurp the whole file, whatever the global separator is
        my $content = <$log_fh>;
        close($log_fh);
        $philosophers_log = $content if defined $content;
    }
    else {
        warn "Cannot read '$philosophers_logfile': $!\n";
    }

    # Drop the heading at the very start of that log, then insert
    # ": Philosophers" in front of every line starting with ':' and lowercase
    # the character following the colon.
    $philosophers_log =~ s/^==.*?==\s*//g;
    $philosophers_log =~ s/(^|\n)(:.)/"$1: Philosophers" . lc($2)/eg;

    # Merge both logs and submit.
    $todays_log = $todays_log . "----\n" . $philosophers_log;
    merge_logs_and_submit($todays_log, $log_file);
}

# Articles which we will not allow in the [[Index of philosophy]].
#
# Takes a reference to the blacklist hash and adds to it, as
# title => explanation:
#   * every [[wikilink]] found in "User:Philosobot/Blacklist.wiki" (first
#     letter upcased; only the first link of each line is used);
#   * every line of "All_philosophers.txt".
# A file that cannot be opened is reported with a warning and contributes
# nothing; the run goes on with whatever could be read.
sub put_philosophers_on_blacklist_and_user_selected_also {
    my $blacklist = shift;

    # Return the lines of a file, without their newlines (empty list on failure).
    my $lines_of = sub {
        my ($path) = @_;
        my $fh;
        unless (open($fh, '<', $path)) {
            warn "Cannot read '$path': $!\n";
            return;
        }
        local $/; # slurp, without depending on the global value of $/
        my $content = <$fh>;
        close($fh);
        return defined $content ? split("\n", $content) : ();
    };

    # The user-selected blacklist.
    foreach my $line ($lines_of->("User:Philosobot/Blacklist.wiki")) {
        next unless $line =~ /\[\[(.*?)\]\]/;
        my $title = $1;
        $title =~ s/^(.)/uc($1)/e; # upcase the first character
        $blacklist->{$title} = '(is in [[User:Philosobot/Blacklist]])';
    }

    # Blacklist the philosophers (which already are in the [[Index of philosophers]]).
    # Done second, so this explanation wins for a title present in both files.
    foreach my $title ($lines_of->("All_philosophers.txt")) {
        $blacklist->{$title} = '(is in the [[Index of philosophers]])';
    }

    return;
}

# The heart of the code.
#
# Builds the new wikitext of one index page.
#   $letter             - "0-9" or a single letter: which page is being built
#   $text               - current wikitext of that page
#   $articles_from_cats - ref to the array of article titles found in the categories
#   $blacklist          - ref to a hash whose keys are titles to leave out
#   $all_articles       - ref to a hash; every title kept here is recorded in it
#                         (this is how the result is exported to the caller)
# Returns the new wikitext of the page.
sub merge_new_entries_from_categories {

    my ($letter, $text, $articles_from_cats, $blacklist, $all_articles) = @_;

    $text = rm_extra_html($text); # replace &amp; with &, etc. This was needed for one run only I think.

    # From each line of the current page keep the target of the wikilink the
    # line starts with (lines not starting with a link become empty and are
    # skipped below), then append the articles found in the categories.
    my @links = map { /^\[\[(.*?)(?:\||\]\])/ ? $1 : "" } split("\n", $text);
    push @links, @$articles_from_cats;

    # Put into a hash the entries starting with the current letter.
    my %articles;
    foreach my $link (@links) {

        next if exists $blacklist->{$link};  # don't add blacklisted items to the list of topics
        next if $link =~ /(talk|wikipedia|template|category|user):/i;  # ignore talk pages, templates, etc

        # Do not put links to the lists themselves. The pages are named
        # "Index of philosophy (<letter>)" by the caller; the variant with
        # " articles" that used to be tested here is still filtered as well.
        next if $link =~ /Index of philosophy(?: articles)? \(/i;

        next if $link =~ /^\s*$/; # ignore empty links

        # Get a copy of the link stripped of accents and non-alphanumeric.
        # It decides which page the link belongs to and is the sort key.
        my $link_stripped = strip_accents_and_stuff($link);

        # Now, do not deal with any articles except the current letter.
        if ($letter eq "0-9") {
            next unless $link_stripped =~ /^\d/;
        }
        else {
            next unless $link_stripped =~ /^$letter/i;
        }

        # Keyed by the stripped title, so titles with the same stripped form
        # share one entry (the last one seen wins).
        $articles{$link_stripped} = "\[\[$link\]\] \[\[Talk:$link\| \]\] -- ";
        $all_articles->{$link} = 1; # this will be exported out of this function
    }

    # Split into sections and collect all data in $text.
    split_into_sections(\%articles);
    $text = "__NOTOC__\n{{PhilTopicTOC}}\n";
    foreach my $key (sort keys %articles) {
        $text .= $articles{$key} . "\n";
    }
    $text .= "\n[[Category:Philosophy-related lists|Philosophy $letter]]\n[[Category:Indexes of articles|Philosophy $letter]]\n";
    return $text;
}

# Submit to "User:Philosobot/New_phil_categories.wiki" the categories that were
# detected today and are not known yet.
#   $philosophy_categories, $philosopher_categories, $other_categories
#                   - refs to the arrays of categories that are already known
#   $new_categories - ref to the array of categories detected in this run; the
#                     lines of "New_philosopher_categories.txt" are appended to
#                     it (the caller's array is modified)
# Each unknown entry containing "Category:" is reported once.
sub post_newly_detected_categories {

    my ($philosophy_categories, $philosopher_categories, $other_categories, $new_categories) = @_;

    # Add to the newly discovered philosophy categories the philosopher
    # categories discovered when running that script. An unreadable file is
    # reported and contributes nothing.
    my $philosopher_cat_list = "New_philosopher_categories.txt";
    if (open(my $fh, '<', $philosopher_cat_list)) {
        local $/; # slurp, without depending on the global value of $/
        my $content = <$fh>;
        close($fh);
        push(@$new_categories, split("\n", $content)) if defined $content;
    }
    else {
        warn "Cannot read '$philosopher_cat_list': $!\n";
    }

    # Current categories.
    my %current_categories = map { $_ => 1 }
        (@$philosophy_categories, @$philosopher_categories, @$other_categories);

    # See which of the @$new_categories are truly new.
    my $text = "";
    my %already_reported; # the merged list may name a category more than once
    foreach my $category (@$new_categories) {
        next if exists $current_categories{$category};
        next unless $category =~ /Category:/;
        next if $already_reported{$category}++;
        $text .= "\[\[:$category\]\] -- \n";
    }

    my $file         = "User:Philosobot/New_phil_categories.wiki";
    my $Editor       = wikipedia_login();
    my $sleep        = 5;
    my $attempts     = 500;
    my $edit_summary = "Today's new philosophy categories.";
    wikipedia_submit($Editor, $file, $edit_summary, $text, $attempts, $sleep);
}

# Put today's log on top of the log of the previous days, trim the result,
# submit it to Wikipedia and store it back on disk.
#   $todays_log - text of today's log
#   $log_file   - local file holding the log; the same name is handed to
#                 wikipedia_submit
sub merge_logs_and_submit {

    my ($todays_log, $log_file) = @_;

    # Read in the log from previous days (from the disk). An unreadable file
    # is reported and treated as an empty log.
    my $combined_log = '';
    if (open(my $in, '<', $log_file)) {
        local $/; # slurp, without depending on the global value of $/
        my $content = <$in>;
        close($in);
        $combined_log = $content if defined $content;
    }
    else {
        warn "Cannot read '$log_file': $!\n";
    }

    # Insert today's log right before the first "==" section that starts on a
    # line of its own (after the first line). When there is no such section
    # yet (new or empty log) the substitution cannot match, so append today's
    # log instead of silently dropping it.
    my $inserted = ($combined_log =~ s/(^.*?\n)(==.*?)$/$1$todays_log\n$2/sg);
    unless ($inserted) {
        $combined_log .= "\n" if length($combined_log) && $combined_log !~ /\n\z/;
        $combined_log .= $todays_log;
    }

    # Keep only the last month or so: the text before the first section plus
    # the 38 sections that follow it.
    my @days = split("\n==", $combined_log);
    splice(@days, 39) if @days > 39; # nothing to trim while the log is short
    $combined_log = join("\n==", @days);

    # Submit the log file, and write the logfile back to disk (away from wikipedia vandals).
    my $Editor       = wikipedia_login();
    my $sleep        = 5;
    my $attempts     = 500;
    my $edit_summary = "Today's changes to the [[Index of philosophy]].";
    wikipedia_submit($Editor, $log_file, $edit_summary, $combined_log, $attempts, $sleep);

    if (open(my $out, '>', $log_file)) {
        print {$out} "$combined_log\n";
        close($out) or warn "Cannot finish writing '$log_file': $!\n";
    }
    else {
        warn "Cannot write '$log_file': $!\n";
    }
}