#!/usr/bin/perl
#
# pagetitle -- irssi script that watches public messages in #ubuntu-cn on
# the freenode connection, fetches the first URL-looking token in a message
# and posts the page's <title> back to the channel.
#
# Safeguards implemented below:
#   * messages from known bot nicks are ignored;
#   * when the companion bot (host .../kves) is in the channel, explicit
#     http(s) URLs are left to it;
#   * a nick that triggers 4 or more lookups within 90 seconds is ignored;
#   * at most one title every 10 seconds, and the same title is not repeated
#     within 120 seconds.
#
# NOTE(review): the HTTP request is synchronous, so irssi is blocked for up
# to timeout * retries seconds while a slow site is being fetched.

use strict;
use warnings;
use utf8;
use 5.10.0;
use vars qw($VERSION %IRSSI);

use Irssi;
use Encode qw/encode decode/;
use HTML::Entities;
use LWP::Simple;
use LWP::UserAgent;

# print join ("\n",@INC);

$VERSION = '0.01';
%IRSSI = (
    authors     => '',
    contact     => '',
    name        => 'pagetitle',
    description => 'head website title in #ubuntu-cn.',
    license     => '',
);

# Regex alternation ("com|net|org|...") built from the suffix file, one
# suffix per line.  Entries are interpolated into patterns as-is.
our $domain_suffix;
{
    my $suffix_file = "/home/alvin/.irssi/scripts/domain_suffix.txt";
    my @suffixes;
    if (open my $fh, "<", $suffix_file) {
        while (my $line = <$fh>) {
            $line =~ s/^\s+|\s+$//g;
            # An empty alternative would make the pattern match everything.
            push @suffixes, $line if length $line;
        }
        close $fh;
    }
    else {
        print "Fetch Title: cannot open $suffix_file: $!";
    }
    # With no suffixes at all, use a pattern that can never match.
    $domain_suffix = @suffixes ? join('|', @suffixes) : '(?!)';
}

our $time_stamp = time - 10;    # time the last title was posted
our $flag       = 0;            # re-entrancy guard around posting
our @nick_array;                # store: "nick time" separated by a space
our $last       = '';           # last title posted
our $bot        = 1;            # bot online, 1; else 0

our $ua = LWP::UserAgent->new;
$ua->agent('Mozilla/5.0 (X11; Linux i686; rv:12.0) Gecko/20100101 Firefox/12.0 Iceweasel/12.0');
$ua->timeout(5);
$ua->max_size(10240/2);         # some sites' titles come after the first 1024 bytes

# check_bot()
# Updates and returns $bot: 1 when a nick whose host ends in
# "unaffiliated/kves" is present in #ubuntu-cn on the FREENODE server tag,
# 0 otherwise.  Returns nothing (leaving $bot untouched) when that server is
# not connected.  Also registered as the 'channel sync' handler; its
# arguments are ignored.
sub check_bot {
    my $server = eval { Irssi::server_find_tag("FREENODE") };
    return unless $server;
    return if $server->{tag} !~ /freenode/i;

    $bot = 0;
    # Best effort: any failure while inspecting the channel leaves $bot at 0.
    eval {
        my $chan = $server->channel_find("#ubuntu-cn");
        if ($chan) {
            foreach my $member ($chan->nicks()) {
                next unless defined $member->{host};
                if ($member->{host} =~ /unaffiliated\/kves$/) {
                    $bot = 1;
                    last;
                }
            }
        }
    };
    return $bot;
}

# delete_flood()
# Drops every @nick_array entry older than 90 seconds (and any entry that
# is not of the form "nick time").
sub delete_flood {
    my $now = time;
    @nick_array = grep {
        defined $_ and $_ =~ / (\d+)$/ and $now - $1 <= 90
    } @nick_array;
    return;
}

# check_flood($nick)
# Returns 1 when $nick has 4 or more entries in @nick_array that are at most
# 90 seconds old, 0 otherwise.
sub check_flood {
    my ($nick) = @_;
    my $now   = time;
    my $count = 0;
    foreach my $entry (@nick_array) {
        next unless defined $entry;
        next unless $entry =~ /^(\S+) (\d+)$/;
        my ($n, $t) = ($1, $2);    # nick, time
        $count++ if ($n eq $nick and $now - $t <= 90);
    }
    return $count >= 4 ? 1 : 0;
}

# _host_of($text)
# Returns the "name.suffix[:port...]" part of $text, using $domain_suffix,
# or undef when $text contains no known suffix.
sub _host_of {
    my ($text) = @_;
    return unless defined $text;
    return $1 if $text =~ /([^\/.]+?\.\b(?:$domain_suffix)\b[^\/]*)/i;
    return;
}

# fetch_header($server, $msg, $nick, $address, $target)
# 'message public' handler.  Lets the signal continue first, then looks for
# a URL in $msg, fetches it and posts "Title: ... (@ host)" to $target.
# Returns nothing; every early return simply means "do not post".
sub fetch_header {
    Irssi::signal_continue(@_);
    my ($server, $msg, $nick, $address, $target) = @_;

    return if $server->{tag} !~ /freenode/i;
    if ($nick =~ /(\^k\^|\[ub\]|kk|sevk)/i) {
        # print "Fetch Title: ^k^'s statement";
        return;
    }
    return if ($target !~ /#ubuntu-cn/);

    # Find a URL: first anything containing a known domain suffix, then a
    # dotted-quad followed by a slash.  $1 is only read after a successful
    # match so a failed match can never pick up a stale capture.
    my $url;
    if ($msg =~ /([a-zA-Z0-9][\x21-\x7E]*?(\.\b($domain_suffix)\b)+[\x21-\x7E]*)/i) {
    # if ($msg =~ /([\x21-\x7E]*?(\.\b($domain_suffix)\b)+[\x21-\x7E]*)/i) {
        $url = $1;
    }
    elsif ($msg =~ /([a-zA-Z0-9]?[\x21-\x7E]*\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}\/[\x21-\x7E]*)/i) {
        $url = $1;
        return if $url =~ /[^.]*192\.168\./;
        return if $url =~ /[^.]*127\.0\./;
    }
    return unless (defined $url);

    return if ($url =~ /\@.*?\.\b($domain_suffix)\b/i);    # e-mail

    check_bot();
    return if ($bot == 1 and $url =~ /^http/i);
    $url = "http://" . $url unless ($url =~ /^http/i or $url =~ /^ftp/i);

    # Host as written in the message; maybe fake, like a URL shortener.
    my $base_url2 = _host_of($url);
    $base_url2 = '' unless defined $base_url2;

    push @nick_array, "$nick " . time;
    if (check_flood($nick) == 1) {
        print "Fetch Title: flood detected.";
        return;
    }
    delete_flood();

    my $resp;
    my $retry = 3;
    do {
        $resp = $ua->get($url);
        $retry--;
    } while ((not $resp->is_success) and $retry);
    unless ($resp->is_success) {
        print "Fetch Title: not success $url ", $resp->status_line;
        return;
    }

    # True host after redirects; show both when they differ.
    my $base_url = _host_of($resp->base);
    $base_url = _host_of($url) unless $base_url;
    $base_url = '' unless defined $base_url;
    if ($base_url ne $base_url2) {
        $base_url = "$base_url <~ $base_url2";
    }

    my $ctype = $resp->header("Content-Type");
    $ctype = '' unless defined $ctype;
    my ($content) = split ";", $ctype;
    return if (not defined $content or $content ne 'text/html');

    # Charset parameter, tolerating optional quotes and whitespace.
    my $enc = '';
    if ($ctype =~ /charset\s*=\s*["']?([^"';\s]+)/i) {
        $enc = $1;
    }

    my $html = $resp->decoded_content(length $enc ? (charset => $enc) : ());
    return unless defined $html;

    my $title;
    if ($html =~ m/<title\b[^>]*>(.*?)<\/title>/si) {
        $title = $1;
    }
    if ((not defined $title) or $title =~ /^\s*$/) {
        # print "Fetch Title: $url can't get title";
        return;
    }

    $title =~ s/[\r\n]//g;
    $title =~ s/\s+/ /g;
    $title =~ s/^\s+//;
    $title =~ s/\s+$//;

    if ($title =~ /\b(porn|sex|pussy|cock|eroti)/i) {
        print "Fetch Title: forbidden words";
        # $title = "..xxXXxx..";
        return;
    }

    $title = "*HTTPS* " . $title if ($resp->base =~ /^https/i or $url =~ /^https/i);

    # Decode HTML entities in one pass.  The entity text is never used as a
    # pattern, so regex metacharacters in the page cannot affect matching.
    $title =~ s/(&#?[a-zA-Z0-9]+;)/my $entity = $1; scalar decode_entities($entity)/ge;
    # Entities such as &#10; can reintroduce control characters; blank them
    # so they never reach the IRC command below.
    $title =~ tr/\x00-\x1f\x7f/ /;

    $title = substr($title, 0, 96) . " ..." if (length $title > 100);

    print "Fetch Title: \$bot = $bot URL: $url base: $base_url ENCl: $enc TITLE: $title";
    return if ($title eq "");

    if ($flag == 0) {
        $flag = 1;
        my $elapsed = time - $time_stamp;
        if ($elapsed > 120 or ($elapsed > 10 and $title ne $last)) {
            $server->command("msg $target Title: $title (\@ $base_url)");
            $time_stamp = time;
            $last = $title;
        }
        $flag = 0;
    }
    return;
}

Irssi::signal_add_last('message public', 'fetch_header');
# Irssi::signal_add_last('message join', 'check_bot');
# Irssi::signal_add_last('message part', 'check_bot');
# Irssi::signal_add_last('message quit', 'check_bot');
# Irssi::signal_add_last('message kick', 'check_bot');
# Irssi::signal_add_last('message nick', 'check_bot');
Irssi::signal_add_last('channel sync', 'check_bot');