# ---------------------------------------------------------- # TextOCR scanner and image validator SA-plugin v. 3.1 # Written by M. Blapp, ImproWare AG, Switzerland # ---------------------------------------------------------- # # README: # ------- # # ocrtext.pm is a plugin for spamassassin 3.1+ to detect # suspect pictures and extract text from them with gocr. # The OCR dictionary functionality has been replaced with # regexes. The plugin can also verify the validity of the # pictures and detects spoofing of the content type. # # # HISTORY: # -------- # # 31.03.2006, v. 1.00 # # Initial revision # # 01.04.2006, v. 1.01 # # Added more words to scanlist # 02.04.2006, v. 1.02 # # Check return values of netpbm utils # # 03.04.2006, v 1.1 # # Remove the eval function and replace # it with parsed_metadata(). Now we can # track errors, count the words we found. # The plugin detects now forged content type # entries. # # 03.04.2006, v 1.11 # # Add a check for suspect pictures and add some # score for it. There are new pics going around # with obfuscated content so ocr scanners are useless # again :-( # # 04.04.2006, v. 1.2 # # The GIF module from Image::ExifTool doesn't recognize # GIFs without colortable as valid pics and just skips them. # The result is a matching SPAMPIC_BROKEN_GIF entry which is # wrong. You should definitely patch your Image::ExifTool installation # with the provided patch at http://antispam.imp.ch/patches/patch-GIF-Colortable # # 04.04.2006, v. 1.2.1 # # Don't scan small pictures, not even for header parsing as it # seems Image::ExifTool has again problems with this. Fix the # size calculations. # # 04.04.2006, v. 1.2.2 # # Count the non standard Image::ExifTool failures as soft errors # and add NONSTD_ tests for them. # # 08.04.2006, v 1.3 # # Much more words to scan for, added a second method to scan jpeg # pics which helps with pics having white font and a lot of noisy # distorts. Rename the SUSPECT_ tests and lower the scores for # them. 
# # 08.04.2006, v 1.3.1 # # Added two other jpeg scanmethods which give a higher match # possibility. # # 12.04.2006, v 1.4 # # Added a timeout (default 10 seconds) and change the scanmethods. # Now we scan also normalized pnm files, this seems to help a lot # on some jpegs. Removed some debug statements. # # 12.04.2006 v. 1.4.1 # # Add a scanlimit, only scan a limited number of images. # Fix a logical error, really redirect all error output to # stderr as I've implemented some time ago, but now it works. # # 12.04.2006 v. 1.4.2 # # Rename some vars to make it more logical, add new spamwords. # Add some perldoc documentation. Change minpixratio_ocr to # 4000 as there are more and more suspect pics around. # # 14.04.2006 v. 1.5 # # Important change. Ignore raw pnm files if parsing has failed # or gocr dumped core (yes this can happen, I'll soon post # a fix for gocr). # # 14.04.2006 v. 1.6 # # Important change. Alter the whole plugin to use pipes and # kill stalled pids after we left the 'helper_run_mode'. Added # three count rules to count alphanumeric chars. # # 14.04.2006 v. 1.6.1 # # Sort out identical chars. Some moirees and patterns are often found in # pictures and they show after an OCR scan repeated chars of the same # type. Not really a sign of words. Added some examples about the ALPHA rules. # # 06.06.2006 v. 1.6.2 # # Fix typo: pngtpnm -> pngtopnm. Now png pictures finally work too. # # 09.06.2006 v. 1.7 # # Add rules against multiple small pictures in HTML mails where # OCR is almost useless. # # 03.09.2006 v. 1.8 # # Add support for animated gifs. Mostly contributed by Romeo Benzoni. # Thanks a lot ! Add ~10 new rules. # # Important: You need now p5-Imager and libungif support. # # 08.09.2006 v. 1.9 # # Handle broken gif pictures and try to fix them if possible. I've # fixed some of the regexes and added a lot of new rules to match # the recent spams. # # 21.10.2006 v. 
2.0 # # Catch the recent image spam with combined pictures and transparent # backgrounds, or images which have different offsets. Try to catch those # tricks all together. # # 22.10.2006 v. 2.1 # # Composed anims were not really correctly combined. Fix this issue. # # 26.10.2006 v. 2.2 # # Catch recent spampics with underline colors. Reorganize the plugin a bit. # Fix logic error introduced in v 2.1 # # 17.11.2006 v. 3.0 # # Add fuzzy string support, but match full and simple regex matches # still directly. Add a maximum score to still do OCR to prevent useless # picture scans. The wordlist is now a simple array at the top of the # config. # # Important: You need now the perl Module String::Approx. # # A lot of the new features have been borrowed from the # Fuzzy OCR Plugin (Thanks Christian !) # # 1.12.2006 v. 3.1 # # Changed ocrtext_minpixels_ocr to need only 20000 pixel pictures. # Changed priority to 100, allowing metatests which did not work # previously. # Added ocrtext_pwords, a list of positive words which give negative # counts. It's almost left empty since releasing this information would # give spammers a new opportunity. # # DESCRIPTION: # ------------ # # Scan suspect pictures and parse them with gocr. Very big and # small pictures are skipped. The suspicious word list has to # be defined in the spamassassin conf. # # # NOTICE: # ------- # # 'r' and 'n' are very similar, and many ocr programs often # can't make a difference between them. So just use '[rn]' instead # of a single char. # # # INSTALLATION: # ------------- # # You'll need: # # - The perl module Perl-Imager. You need an already installed # libungif port. Please make sure gif pictures are really enabled. 
# # - Perl module Image::ExifTool and a patch for GIF pics: # http://antispam.imp.ch/patches/patch-GIF-Colortable # # - Perl module String::Approx # # - Gocr from http://jocr.sourceforge.net and a patch to # avoid segfaults with gocr: # http://antispam.imp.ch/patches/patch-gocr-segfault # # - Netpbm from http://netpbm.sourceforge.net # # - Libungif for fixgif and animated gif support. # # You can extract the plugin with 'patch < patch-ocrtext' # # Check if you have the necessary tools like giftopnm,jpegtopnm # pngtopnm, djpeg, pnminvert all in the same path. # --- /dev/null Fri Nov 17 10:46:25 2006 +++ ocrtext.cf Mon Nov 6 11:11:10 2006 @@ -0,0 +1,109 @@ +gocr_path /usr/local/bin/gocr +pnmtools_path /usr/local/bin +ocrtext_dscore 15 + +ocrtext_words realtime;alert;actquick;announce;headline;charts;increase;below;rating;takeoff;resource;ready::0.1;profit;news::0.1;wallstreet;free::0;pick::0.1;breaking;explosive;strong;spotlight;watch;symbol;stock;investor;offer;international;company;money::0;million;thousand;loose;buy;price::0.1;trade;worldtrade;target::0.1;higher;banking;service;recommendation;viagra;soma::0.1;cialis::0.1;xanax;valium;meridia::0.1;zanaflex;levitra;herbal::0.1;medicine;doctor;pills;legal;penis::0;erection::0.1;supplement;medication;weightloss;growth;drugs;pharmacy;prescription;click::0.1;here::0;software;kunde;volksbank;sparkasse;master;degree;bachelor;diploma;removal;visit;browser;readmore;type::0.1;cheap;shipping;quality;sideeffects;size::0.1;focused;replica::0.1;sale::0.1;bags::0.1;development;technology;expect;long-term;quick::0.1;afford;tradeout;compensate + +ocrtext_pwords information + +body OCRTEXT eval:ocrtext_check() +priority OCRTEXT 100 + +# +# Validate the GIF/PNG/JPEG pictures +# +body SPAMPIC_FORGED_CT eval:ocrtext_eval() +describe SPAMPIC_FORGED_CT Forged content-type in mime header +score SPAMPIC_FORGED_CT 3.000 + +body SPAMPIC_SUSPECT eval:ocrtext_eval() +describe SPAMPIC_SUSPECT Suspect image found +score SPAMPIC_SUSPECT 0.900 + 
+body GIFANIM_SUSPECT eval:ocrtext_eval() +describe GIFANIM_SUSPECT Suspect animated gif found +score GIFANIM_SUSPECT 2.500 + +body SPAMPIC_UNKNOWN eval:ocrtext_eval() +describe SPAMPIC_UNKNOWN Failed to read image header +score SPAMPIC_UNKNOWN 2.000 + +body SPAMPIC_NONSTD eval:ocrtext_eval() +describe SPAMPIC_NONSTD Non standard image header +score SPAMPIC_NONSTD 0.200 + +body SPAMPIC_BROKEN eval:ocrtext_eval() +describe SPAMPIC_BROKEN Contains damaged image +score SPAMPIC_BROKEN 1.500 + +body SPAMPIC_ALPHA_1 eval:ocrtext_eval() +describe SPAMPIC_ALPHA_1 Image contains many alphanumeric chars +score SPAMPIC_ALPHA_1 0.500 + +body SPAMPIC_ALPHA_2 eval:ocrtext_eval() +describe SPAMPIC_ALPHA_2 Image contains many alphanumeric chars +score SPAMPIC_ALPHA_2 1.000 + +body SPAMPIC_ALPHA_3 eval:ocrtext_eval() +describe SPAMPIC_ALPHA_3 Image contains many alphanumeric chars +score SPAMPIC_ALPHA_3 1.500 + +body __SPAMPIC_COUNT_2 eval:ocrtext_eval() +body __SPAMPIC_COUNT_3 eval:ocrtext_eval() +body __SPAMPIC_COUNT_4 eval:ocrtext_eval() +body __SPAMPIC_COUNT_5 eval:ocrtext_eval() +body __SPAMPIC_COUNT_6 eval:ocrtext_eval() +body __SPAMPIC_COUNT_7 eval:ocrtext_eval() +rawbody __HAVE_CID /src=["']?cid:/i + +# +# Multiple inline pics without text are very suspicios +# +meta SPAMPIC_MULTI_1 (__SPAMPIC_COUNT_2 + (HTML_IMAGE_ONLY_04 || HTML_IMAGE_ONLY_08 || HTML_IMAGE_ONLY_12 || HTML_IMAGE_ONLY_16 || HTML_IMAGE_ONLY_20 || HTML_IMAGE_ONLY_24 || HTML_IMAGE_ONLY_28 || HTML_IMAGE_ONLY_32) + __HAVE_CID + (IMPPYZOR_CHECK || SPAMPIC_WORDS_1 || SPAMPIC_ALPHA_1 || SPAMPIC_ALPHA_2 || SPAMPIC_ALPHA_3) == 4) +describe SPAMPIC_MULTI_1 Contains inline pics (2) +score SPAMPIC_MULTI_1 1.000 + +meta SPAMPIC_MULTI_2 (__SPAMPIC_COUNT_3 + (HTML_IMAGE_ONLY_04 || HTML_IMAGE_ONLY_08 || HTML_IMAGE_ONLY_12 || HTML_IMAGE_ONLY_16 || HTML_IMAGE_ONLY_20 || HTML_IMAGE_ONLY_24 || HTML_IMAGE_ONLY_28 || HTML_IMAGE_ONLY_32) + __HAVE_CID + (IMPPYZOR_CHECK || SPAMPIC_WORDS_1 || SPAMPIC_ALPHA_1 || SPAMPIC_ALPHA_2 || 
SPAMPIC_ALPHA_3) == 4) +describe SPAMPIC_MULTI_2 Contains inline pics (3) +score SPAMPIC_MULTI_2 2.000 + +meta SPAMPIC_MULTI_3 (__SPAMPIC_COUNT_4 + (HTML_IMAGE_ONLY_04 || HTML_IMAGE_ONLY_08 || HTML_IMAGE_ONLY_12 || HTML_IMAGE_ONLY_16 || HTML_IMAGE_ONLY_20 || HTML_IMAGE_ONLY_24 || HTML_IMAGE_ONLY_28 || HTML_IMAGE_ONLY_32) + __HAVE_CID + (IMPPYZOR_CHECK || SPAMPIC_WORDS_1 || SPAMPIC_ALPHA_1 || SPAMPIC_ALPHA_2 || SPAMPIC_ALPHA_3) == 4) +describe SPAMPIC_MULTI_3 Contains inline pics (4) +score SPAMPIC_MULTI_3 2.500 + +meta SPAMPIC_MULTI_4 (__SPAMPIC_COUNT_5 + (HTML_IMAGE_ONLY_04 || HTML_IMAGE_ONLY_08 || HTML_IMAGE_ONLY_12 || HTML_IMAGE_ONLY_16 || HTML_IMAGE_ONLY_20 || HTML_IMAGE_ONLY_24 || HTML_IMAGE_ONLY_28 || HTML_IMAGE_ONLY_32) + __HAVE_CID + (IMPPYZOR_CHECK || SPAMPIC_WORDS_1 || SPAMPIC_ALPHA_1 || SPAMPIC_ALPHA_2 || SPAMPIC_ALPHA_3) == 4) +describe SPAMPIC_MULTI_4 Contains inline pics (5) +score SPAMPIC_MULTI_4 3.000 + +meta SPAMPIC_MULTI_5 (__SPAMPIC_COUNT_6 + (HTML_IMAGE_ONLY_04 || HTML_IMAGE_ONLY_08 || HTML_IMAGE_ONLY_12 || HTML_IMAGE_ONLY_16 || HTML_IMAGE_ONLY_20 || HTML_IMAGE_ONLY_24 || HTML_IMAGE_ONLY_28 || HTML_IMAGE_ONLY_32) + __HAVE_CID + (IMPPYZOR_CHECK || SPAMPIC_WORDS_1 || SPAMPIC_ALPHA_1 || SPAMPIC_ALPHA_2 || SPAMPIC_ALPHA_3) == 4) +describe SPAMPIC_MULTI_5 Contains inline pics (6) +score SPAMPIC_MULTI_5 4.000 + +meta SPAMPIC_MULTI_6 (__SPAMPIC_COUNT_7 + (HTML_IMAGE_ONLY_04 || HTML_IMAGE_ONLY_08 || HTML_IMAGE_ONLY_12 || HTML_IMAGE_ONLY_16 || HTML_IMAGE_ONLY_20 || HTML_IMAGE_ONLY_24 || HTML_IMAGE_ONLY_28 || HTML_IMAGE_ONLY_32) + __HAVE_CID + (IMPPYZOR_CHECK || SPAMPIC_WORDS_1 || SPAMPIC_ALPHA_1 || SPAMPIC_ALPHA_2 || SPAMPIC_ALPHA_3) == 4) +describe SPAMPIC_MULTI_6 Contains inline pics (7+) +score SPAMPIC_MULTI_6 5.000 + + + +# +# Summarize the OCR scan results +# +body SPAMPIC_WORDS_1 eval:ocrtext_eval() +describe SPAMPIC_WORDS_1 Contains inline spam picture (1) +score SPAMPIC_WORDS_1 1.500 + +body SPAMPIC_WORDS_2 eval:ocrtext_eval() +describe 
SPAMPIC_WORDS_2 Contains inline spam picture (2) +score SPAMPIC_WORDS_2 4.000 + +body SPAMPIC_WORDS_3 eval:ocrtext_eval() +describe SPAMPIC_WORDS_3 Contains inline spam picture (3) +score SPAMPIC_WORDS_3 6.000 + +body SPAMPIC_WORDS_4 eval:ocrtext_eval() +describe SPAMPIC_WORDS_4 Contains inline spam picture (4) +score SPAMPIC_WORDS_4 9.000 + +body SPAMPIC_WORDS_5 eval:ocrtext_eval() +describe SPAMPIC_WORDS_5 Contains inline spam picture (5+) +score SPAMPIC_WORDS_5 12.000 --- /dev/null Sat Dec 2 09:49:19 2006 +++ ocrtext.pm Sat Dec 2 09:47:19 2006 @@ -0,0 +1,1170 @@ +=head1 NAME + +Mail::SpamAssassin::Plugin::ocrtext - Check for specific keywords in gif/jpg/png attachments, using gocr. + +=head1 SYNOPSIS + + loadplugin Mail::SpamAssassin::Plugin::ocrtext /path/to/ocrtext.pm + + # Words to scan for + ocrtext_words stock;alert;etc ... + + # Positive words to scan for + ocrtext_pwords + + # Max pics to scan + ocrtext_maxscans 3 + + # Scan timeout per pic + ocrtext_timeout 8 + + # Maximum score to still do OCR + ocrtext_dscore 10 + + # Min pixel per kb to do checks + ocrtext_minpixratio_suspect 10000 + + # Min pixel per kb to do OCR + ocrtext_minpixratio_ocr 2000 + + # Min pixels to do OCR + ocrtext_minpixels_ocr 20000 + + # Max size of pic in kb to do OCR + ocrtext_maxsize_ocr 100 + + # Min size of pic in kb to do OCR + ocrtext_minsize_ocr 4 + + # Min size of pic in kb to do anything at all + ocrtext_minsize 1 + + # Limit 1 of chars an OCR scan can have to match + ocrtext_alpha1 32 + + # Limit 2 of chars an OCR scan can have to match + ocrtext_alpha2 100 + + # Limit 3 of chars an OCR scan can have to match + ocrtext_alpha3 400 + + # Path of the gocr binary + gocr_path /usr/local/bin/gocr + + # Path of the pnmtools binaries + pnmtools_path /usr/local/bin + +=head1 DESCRIPTION + +Checks for specific keywords in gif/jpg/png attachments, using gocr. 
+This can be used to detect spam that puts all the real content in an +attached image, accompanied with random text and html (no URLs, etc). +There are also various rules to validate attached images and to detect +forged content types or broken images. + +=head1 AUTHOR + +Martin Blapp, mb -at- imp -dot- ch + +=head1 COPYRIGHT + +Copyright (C) 2004-2006 ImproWare AG. All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + +THIS SOFTWARE IS PROVIDED BY IMPROWARE INC. AND ITS CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+
+=cut
+
+package Mail::SpamAssassin::Plugin::ocrtext;
+use strict;
+use Mail::SpamAssassin;
+use Mail::SpamAssassin::Plugin;
+use String::Approx 'adistr';
+use Image::ExifTool;
+use Imager;
+# Inherit from the SpamAssassin plugin base class.
+our @ISA = qw(Mail::SpamAssassin::Plugin);
+sub dbg { Mail::SpamAssassin::dbg (@_); }    # debug helper: forwards all arguments to Mail::SpamAssassin::dbg
+# NOTE(review): $threshold is not referenced in the part of the file visible here; presumably the String::Approx fuzzy-match distance -- confirm.
+our $threshold = "0.15";
+# Constructor: builds the plugin through SUPER::new, registers the config settings and the two eval rules, and returns the object.
+sub new {
+    my ($class, $mailsa, $server) = @_;    # $server is accepted but never used in this sub
+    $class = ref($class) || $class;    # allow calling new() on an instance as well as on the class name
+    my $self = $class->SUPER::new($mailsa);
+    bless ($self, $class);
+    $self->set_config($mailsa->{conf});    # declare the ocrtext_* / *_path settings on the main object's conf
+    $self->register_eval_rule("ocrtext_check");    # does the real work (rule OCRTEXT in ocrtext.cf)
+    $self->register_eval_rule("ocrtext_eval");    # placeholder eval used by the SPAMPIC_* / GIFANIM_* rules
+    return $self;
+}
+# set_config($conf): collects every setting of this plugin in @cmds and registers the list with $conf->{parser}. All settings are is_admin => 1.
+sub set_config {
+    my($self, $conf) = @_;
+    my @cmds = ();
+    # ocrtext_check stops processing further images once its image counter reaches this value.
+    push(@cmds, {
+        setting => 'ocrtext_maxscans',
+        is_admin => 1,
+        default => 3,
+        type => $Mail::SpamAssassin::Conf::CONF_TYPE_NUMERIC
+    });
+    # Seconds allowed for the helper runs on one picture (passed as 'secs' to Mail::SpamAssassin::Timeout).
+    push(@cmds, {
+        setting => 'ocrtext_timeout',
+        is_admin => 1,
+        default => 8,
+        type => $Mail::SpamAssassin::Conf::CONF_TYPE_NUMERIC
+    });
+    # ocrtext_check returns early when the message score already exceeds this value.
+    push(@cmds, {
+        setting => 'ocrtext_dscore',
+        is_admin => 1,
+        default => 10,
+        type => $Mail::SpamAssassin::Conf::CONF_TYPE_NUMERIC
+    });
+    # Lower size bound for OCR in KB (multiplied by 1024 before it is compared with the byte size).
+    push(@cmds, {
+        setting => 'ocrtext_minsize_ocr',
+        is_admin => 1,
+        default => 4,
+        type => $Mail::SpamAssassin::Conf::CONF_TYPE_NUMERIC
+    });
+    # Upper size bound for OCR in KB (multiplied by 1024 before it is compared with the byte size).
+    push(@cmds, {
+        setting => 'ocrtext_maxsize_ocr',
+        is_admin => 1,
+        default => 100,
+        type => $Mail::SpamAssassin::Conf::CONF_TYPE_NUMERIC
+    });
+    # Pictures with fewer pixels (height * width) than this are not OCR-scanned.
+    push(@cmds, {
+        setting => 'ocrtext_minpixels_ocr',
+        is_admin => 1,
+        default => 20000,
+        type => $Mail::SpamAssassin::Conf::CONF_TYPE_NUMERIC
+    });
+    # Minimum pixels-per-KB ratio a picture needs to be OCR-scanned.
+    push(@cmds, {
+        setting => 'ocrtext_minpixratio_ocr',
+        is_admin => 1,
+        default => 2000,
+        type => $Mail::SpamAssassin::Conf::CONF_TYPE_NUMERIC
+    });
+    # Pixels-per-KB ratio above which SPAMPIC_SUSPECT is hit. The default here is 5000; the POD synopsis example shows 10000.
+    push(@cmds, {
+        setting => 'ocrtext_minpixratio_suspect',
+        is_admin => 1,
+        default => 5000,
+        type => $Mail::SpamAssassin::Conf::CONF_TYPE_NUMERIC
+    });
+    # Pictures of at most this many KB are skipped before any inspection takes place.
+    push(@cmds, {
+        setting => 'ocrtext_minsize',
+        is_admin => 1,
+        default => 1,
+        type => $Mail::SpamAssassin::Conf::CONF_TYPE_NUMERIC
+    });
+    # ocrtext_alpha1..3: character-count limits per the POD synopsis. NOTE(review): their use is outside the visible code; presumably tied to SPAMPIC_ALPHA_1..3 -- confirm.
+    push(@cmds, {
+        setting => 'ocrtext_alpha1',
+        is_admin => 1,
+        default => 32,
+        type => $Mail::SpamAssassin::Conf::CONF_TYPE_NUMERIC
+    });
+
+    push(@cmds, {
+        setting => 'ocrtext_alpha2',
+        is_admin => 1,
+        default => 100,
+        type => $Mail::SpamAssassin::Conf::CONF_TYPE_NUMERIC
+    });
+
+    push(@cmds, {
+        setting => 'ocrtext_alpha3',
+        is_admin => 1,
+        default => 400,
+        type => $Mail::SpamAssassin::Conf::CONF_TYPE_NUMERIC
+    });
+    # Word list to scan for; stored as one string, no default.
+    push(@cmds, {
+        setting => 'ocrtext_words',
+        is_admin => 1,
+        default => undef,
+        type => $Mail::SpamAssassin::Conf::CONF_TYPE_STRING
+    });
+    # "Positive" word list; stored as one string, no default.
+    push(@cmds, {
+        setting => 'ocrtext_pwords',
+        is_admin => 1,
+        default => undef,
+        type => $Mail::SpamAssassin::Conf::CONF_TYPE_STRING
+    });
+    # Directory holding the netpbm helper binaries; validated by the handler below instead of a plain type.
+    push (@cmds, {
+        setting => 'pnmtools_path',
+        is_admin => 1,
+        default => undef,
+        code => sub {
+            my ($self, $key, $value, $line) = @_;    # NOTE(review): this $self shadows the plugin object; presumably the Conf object, since ocrtext_check reads {main}->{conf}->{pnmtools_path} -- confirm
+            if (!defined $value || !length $value) {
+                return $Mail::SpamAssassin::Conf::MISSING_REQUIRED_VALUE;
+            }
+            $value = Mail::SpamAssassin::Util::untaint_file_path($value);
+            if (!-d $value) {    # must be an existing directory
+                dbg("config: pnmtools_path \"$value\" isn't an directory");
+                return $Mail::SpamAssassin::Conf::INVALID_VALUE;
+            }
+            $self->{pnmtools_path} = $value;
+        }
+    });
+    # Path of the gocr binary; validated by the handler below instead of a plain type.
+    push (@cmds, {
+        setting => 'gocr_path',
+        is_admin => 1,
+        default => undef,
+        code => sub {
+            my ($self, $key, $value, $line) = @_;    # same argument layout as the pnmtools_path handler
+            if (!defined $value || !length $value) {
+                return $Mail::SpamAssassin::Conf::MISSING_REQUIRED_VALUE;
+            }
+            $value = Mail::SpamAssassin::Util::untaint_file_path($value);
+            if (!-x $value) {    # must be executable by the current user
+                dbg("config: gocr_path \"$value\" isn't an executable");
+                return $Mail::SpamAssassin::Conf::INVALID_VALUE;
+            }
+            $self->{gocr_path} = $value;
+        }
+    });
+    # Hand the complete table to the configuration parser.
+    $conf->{parser}->register_commands(\@cmds);
+}
+# Placeholder eval for the SPAMPIC_* / GIFANIM_* rules: always returns 0; ocrtext_check records those hits itself via $pms->_handle_hit.
+sub ocrtext_eval {
+    return 0;
+}
+
+sub ocrtext_check { + my ($self, $pms) = @_; + my $partcount = 0; + my @ocrtext; + my $pnmtools_path = $self->{main}->{conf}->{pnmtools_path}; + my 
$giffix = "$pnmtools_path/giffix"; + my $pid0; + + my $maxscans = $pms->{main}->{conf}->{ocrtext_maxscans}; + my $dscore = $pms->{main}->{conf}->{ocrtext_dscore}; + my $imagecount = 0; + my $imagetcount = 0; + + my $cscore = $pms->get_score(); + dbg("ocrtext: score is $cscore"); + if ( $cscore > $dscore ) { + dbg("ocrtext: Skip OCR scan, message has already $cscore points of needed $dscore points."); + return 0; + } + + foreach my $p ( $pms->{msg}->find_parts("image") ) { + + $imagetcount++; + + # + # Only scan images up to $maxscans images. + # + if ($imagecount >= $maxscans) { + next; + } + + my ( $ctype, $boundary, $charset, $name ) = + Mail::SpamAssassin::Util::parse_content_type( + $p->get_header('content-type')); + + dbg("ocrtext: findparts() found possible $ctype image"); + + my ($tmpfpath, $tmpf) = Mail::SpamAssassin::Util::secure_tmpfile(); + dbg("ocrtext: created tempfile $tmpfpath"); + + my $picture_header = ""; + my $gotheader = 0; + foreach my $out ($p->decode()) { + if ($gotheader < 10) { + $picture_header .= $out; + $gotheader ++; + } + print $tmpf $out; + } + close ($tmpf); + dbg("ocrtext: saved image as $tmpfpath"); + + my $filesize = (stat($tmpfpath))[7]; + my $minsize = 1024 * $pms->{main}->{conf}->{ocrtext_minsize}; + if ($filesize <= $minsize) { + dbg("ocrtext: Skip pic, size $filesize is smaller than $minsize KB"); + unlink $tmpfpath; + next; + } + + my $exifTool = new Image::ExifTool; + my %opts; + + $exifTool->Options(Unknown => 1); + $exifTool->Options(Verbose => 0); + my $success = $exifTool->ExtractInfo($tmpfpath, %opts); + + my $info = $exifTool->GetInfo('FileType', 'FileSize', 'ImageWidth', 'ImageHeight'); + if (!$success) { + if (! $$info{'FileType'} && ! $$info{'ImageHeight'} && ! 
$$info{'ImageWidth'}) { + my $success = 0; + + if ($ctype eq "image/png" || $ctype eq "image/jpeg") { + my $hitdesc = "SPAMPIC_BROKEN"; + $pms->_handle_hit($hitdesc, + $pms->{conf}->{scores}->{$hitdesc}, "BODY: ", + $pms->{conf}->{descriptions}->{$hitdesc} + ); + $pms->{tests_already_hit}->{$hitdesc} = 1; + dbg("ocrtext: broken pic found, exifTool->ExtractInfo failed"); + } elsif ($ctype eq "image/gif" || substr($picture_header,0,3) eq "\x47\x49\x46") { + if ($pnmtools_path ne "" && -x $giffix) { + # + # If we got a broken gif, try to fix it. + # + dbg("ocrtext: broken pic found, try to fix it"); + my $tmpoutput = ""; + my $tmpfpathfixed = $tmpfpath . ".fixed"; + + $pid0 = Mail::SpamAssassin::Util::helper_app_pipe_open(*PID0, + $tmpfpath, 1, "$giffix -q > $tmpfpathfixed"); + my @response = ; + close PID0; + while (my $v = shift @response) { + $tmpoutput .= $v; + } + quit_stale_helper(*PID0, $pid0, 0); + unlink $tmpfpath; + $tmpfpath = $tmpfpathfixed; + # Try again ... + $success = $exifTool->ExtractInfo($tmpfpath, %opts); + } + } + if (!$success) { + my $hitdesc = "SPAMPIC_BROKEN"; + $pms->_handle_hit($hitdesc, + $pms->{conf}->{scores}->{$hitdesc}, "BODY: ", + $pms->{conf}->{descriptions}->{$hitdesc} + ); + $pms->{tests_already_hit}->{$hitdesc} = 1; + dbg("ocrtext: broken pic found, exifTool->ExtractInfo failed"); + dbg("ocrtext: could not extract picture info"); + unlink $tmpfpath; + next; + } + } else { + my $rtype = $$info{'FileType'}; + if ($rtype eq "GIF" || $rtype eq "PNG" || $rtype eq "JPEG") { + my $hitdesc = "SPAMPIC_NONSTD"; + $pms->_handle_hit($hitdesc, + $pms->{conf}->{scores}->{$hitdesc}, "BODY: ", + $pms->{conf}->{descriptions}->{$hitdesc} + ); + $pms->{tests_already_hit}->{$hitdesc} = 1; + dbg("ocrtext: non standard $rtype ($ctype) pic found, exifTool->ExtractInfo partly failed"); + } else { + dbg("ocrtext: non standard $rtype ($ctype) pic found, exifTool->ExtractInfo partly failed"); + } + } + } + + my $rtype = $$info{'FileType'}; + + if (($ctype eq 
"image/gif" && $rtype ne "GIF") || ($ctype eq "image/png" && $rtype ne "PNG") || ($ctype eq "image/jpeg" && $rtype ne "JPEG")) { + dbg("ocrtext: wrong content type, picture is not a $ctype picture"); + my $hitdesc = "SPAMPIC_FORGED_CT"; + $pms->_handle_hit($hitdesc, + $pms->{conf}->{scores}->{$hitdesc}, "BODY: ", + $pms->{conf}->{descriptions}->{$hitdesc} + ); + $pms->{tests_already_hit}->{$hitdesc} = 1; + } + + if ($rtype eq "GIF" || $rtype eq "PNG" || $rtype eq "JPEG") { + # + # Deal with Kb and bytes + # + my $size = $$info{'FileSize'}; + my $bytes; + if ($size =~ /kB/i) { + $size =~ s/[a-zA-Z]+//g; + $bytes = $size * 1024; + } elsif ($size =~ /bytes/i) { + $size =~ s/[a-zA-Z]+//g; + $bytes = $size; + } else { + $bytes = $size; + } + my $height = $$info{'ImageHeight'}; + my $width = $$info{'ImageWidth'}; + my $pixels = $height * $width; + my $pixratio = ($height * $width) / ($bytes / 1024) ; + + dbg("ocrtext: found $rtype image: size=$bytes, height=$height, width=$width"); + + my $minpix = $pms->{main}->{conf}->{ocrtext_minpixels_ocr}; + my $minsize = 1024 * $pms->{main}->{conf}->{ocrtext_minsize_ocr}; + my $maxsize = 1024 * $pms->{main}->{conf}->{ocrtext_maxsize_ocr}; + my $pixratio_ocr = $pms->{main}->{conf}->{ocrtext_minpixratio_ocr}; + my $pixratio_suspect = $pms->{main}->{conf}->{ocrtext_minpixratio_suspect}; + + if ($bytes < $minsize) { + dbg("ocrtext: skip picture, size $bytes too small, needed $minsize"); + } elsif ($bytes > $maxsize) { + dbg("ocrtext: skip picture, size $bytes too big, needed $maxsize"); + } elsif ($pixels < $minpix ) { + dbg("ocrtext: skip picture, to few pixels: $pixels, needed $minpix"); + } elsif ($pixratio < $pixratio_ocr) { + dbg("ocrtext: skip picture, pixel/size ratio $pixratio too small, needed $pixratio_ocr"); + } else { + # + # First check if the pixel/size ratio is suspect. This + # should give a small amount of SA hits, even if no + # suspect words are detected. 
+ # + if ($pixratio > $pixratio_suspect) { + my $hitdesc = "SPAMPIC_SUSPECT"; + dbg("ocrtext: SUSPECT $ctype ($rtype) PICTURE FOUND"); + $pms->_handle_hit($hitdesc, + $pms->{conf}->{scores}->{$hitdesc}, "BODY: ", + $pms->{conf}->{descriptions}->{$hitdesc} . "\n$ctype ($rtype)" + ); + $pms->{tests_already_hit}->{$hitdesc} = 1; + } + + # + # Our picture matches the size requirements, now do text checks. + # Redirect any gocr errors to /dev/null + # + my $gocr_path = $self->{main}->{conf}->{gocr_path}; + my $gocr = "$gocr_path -v 0 -e /dev/null -i -"; + my $output = ""; + my $exitval = 0; + + my $pnminvert = "$pnmtools_path/pnminvert -quiet 2>&1"; + my $pnmnorm = "$pnmtools_path/pnmnorm -quiet 2>&1"; + my $pnmgamma = "$pnmtools_path/pnmgamma -quiet 2>&1"; + my $giftopnm = "$pnmtools_path/giftopnm -quiet"; + my $giftopnmall = "$pnmtools_path/giftopnm -image=all"; + my $jpegtopnm = "$pnmtools_path/jpegtopnm -quiet"; + my $pngtopnm = "$pnmtools_path/pngtopnm -quiet"; + my $djpeg = "$pnmtools_path/djpeg"; + + my $giftopnmexe = "$pnmtools_path/giftopnm"; + my $jpegtopnmexe = "$pnmtools_path/jpegtopnm"; + my $pngtopnmexe = "$pnmtools_path/pngtopnm"; + + my $pid1; + my $pid2; + my $pid3; + my $pid4; + my $pid5; + my $pid6; + my $pid7; + my $pid8; + my $pid9; + my $pid10; + my $pid11; + my $pid12; + + # + # Limit the scantime + # + $pms->enter_helper_run_mode(); + my $timer = Mail::SpamAssassin::Timeout->new({ secs => $self->{main}->{conf}->{ocrtext_timeout} }); + my $err = $timer->run_and_catch(sub { + + local $SIG{PIPE} = sub { die "__brokenpipe__ignore__\n" }; + + if ($rtype eq "GIF") { + $imagecount++; + if ($pnmtools_path ne "" && -x $giftopnmexe) { + my $tmpoutput = ""; + # + # Now check how many frames we got + # + my $frame = 0; + my $img_num = 0; + my $img_delay = 0; + my @imgs = Imager->new; + my @imgs_back; + @imgs= Imager->read_multi(file => $tmpfpath, type=>'gif'); + my $img_loc_count = 0; + my $has_img_loop = 0; + my $has_img_disposal = 0; + my $need_cons = 0; + 
my $number = $#imgs + 1; + if ($#imgs > 0) { + # + # We've got an animated gif + # + + my $img_top_prev; + for(my $i = 0; $i <= $#imgs; $i++) { + my $img_delay_new = $imgs[$i]->tags(name => "gif_delay"); + my $img_top = $imgs[$i]->tags(name => "gif_top"); + my $img_left = $imgs[$i]->tags(name => "gif_left"); + my $img_loop = $imgs[$i]->tags(name => "gif_loop"); + my $img_loop_count = $imgs[$i]->tags(name => "gif_loop_count"); + my $colors = $imgs[$i]->getcolorcount(); + my $framecount = $i + 1; + my $img_disposal = $imgs[$i]->tags(name => "gif_disposal"); + if ($img_top && $img_top != $img_top_prev && !$img_left && $img_disposal) { + $img_loc_count++; + } + $img_top_prev = $img_top; + if ($img_delay_new > $img_delay && $img_delay <= 1000) { + $img_num = $i; + $img_delay = $img_delay_new; + } + + if (!$img_loop) { + $img_loop = "NA"; + } + if ($img_loop eq "0" || $img_loop ne "NA" && $img_loop > 0) { + $has_img_loop = 1; + } else { + $img_loop = "NA"; + } + if ($img_disposal) { + $has_img_disposal = 1; + } + dbg("ocrtext: GIF ANIM frame $i has $img_delay delay, disp=$img_disposal, top=$img_top, left=$img_left, loop=$img_loop"); + + if (!$need_cons) { + dbg("ocrtext: push frame $i into \@imgs_back"); + push(@imgs_back, $imgs[$i]); + } + if (!$need_cons && $img_delay > 1000) { + $need_cons = 1; + } + } + # + # off by one between Imager and netpbm + # + $frame = $img_num; + $frame++; + dbg("ocrtext: Imager thinks spam frame may be nr. 
$frame"); + } else { + dbg("ocrtext: Only one frame found, skip imager part."); + $frame = 1; + } + if ($number > 1 && $frame == 1 || $number > 1 && $number == $frame) { + my $dbgtext = "with $number frames"; + dbg("ocrtext: SUSPECT GIF ANIM $dbgtext"); + my $hitdesc = "GIFANIM_SUSPECT"; + $pms->_handle_hit($hitdesc, + $pms->{conf}->{scores}->{$hitdesc}, "BODY: ", + $pms->{conf}->{descriptions}->{$hitdesc} + ); + $pms->{tests_already_hit}->{$hitdesc} = 1; + $need_cons = 2; + } elsif ($img_loc_count && $frame > 1 || (!$has_img_loop && $number > 1 && $has_img_disposal)) { + my $dbgtext = "with $number moving frames"; + dbg("ocrtext: SUSPECT GIF ANIM $dbgtext"); + my $hitdesc = "GIFANIM_SUSPECT"; + $pms->_handle_hit($hitdesc, + $pms->{conf}->{scores}->{$hitdesc}, "BODY: ", + $pms->{conf}->{descriptions}->{$hitdesc} + ); + $pms->{tests_already_hit}->{$hitdesc} = 1; + $need_cons = 1; + } + my @gifpics; + push(@gifpics, $tmpfpath); + + # + # Create a consolidated pic if neccessary. this tries to workaround various tricks + # spammers use :-) + # + if ($need_cons) { + my $mypath = $tmpfpath . ".cons"; + my $img = Imager->new; + + dbg("ocrtext: Consolidate picture"); + if ($need_cons == 1) { + dbg("ocrtext: Save pics up to the offending frame"); + my @imgs_temp = Imager->new; + Imager->write_multi({ file=> $mypath, makemap=>'webmap', type => 'gif'}, @imgs_back); + + dbg("ocrtext: Write pic with limited frames"); + $img->read(file=>$mypath, gif_consolidate=>1) + or die $img->errstr; + } else { + $img->read(file=>$tmpfpath, gif_consolidate=>1) + or die $img->errstr; + } + + dbg("ocrtext: Write flat pic with consolidated frames"); + $mypath = $tmpfpath . 
".cons"; + $img->write(file=>$mypath, type => 'gif') + or die $img->errstr; + + push(@gifpics, $mypath); + } + + # + # Now proceed with the gif(s) + # + + my $y = 0; + foreach $tmpfpath (@gifpics) { + { + + if ($y) { + $frame = 1; + } + dbg("ocrtext: Scan $frame of $tmpfpath"); + + $tmpoutput = ""; + $exitval = 0; + $y++; + + $pid1 = Mail::SpamAssassin::Util::helper_app_pipe_open(*PID1, + $tmpfpath, 1, "$giffix | $giftopnm -image=$frame 2>&1 | $gocr"); + my @response = ; + close PID1; + while (my $v = shift @response) { + $tmpoutput .= $v; + } + $exitval = quit_stale_helper(*PID1, $pid1, $exitval); + } + + if ($exitval != 0) { + # + # Only return a bad value if giftopnm or giffix have failed + # + $tmpoutput = ""; + $exitval = 0; + { + $pid2 = Mail::SpamAssassin::Util::helper_app_pipe_open(*PID2, + $tmpfpath, 1, "$giffix | $giftopnm -image=$frame 2>&1"); + my @response = ; + close PID2; + $exitval = quit_stale_helper(*PID2, $pid2, $exitval); + } + if ($exitval != 0) { + dbg("ocrtext: broken gif pic found"); + my $hitdesc = "SPAMPIC_BROKEN"; + $pms->_handle_hit($hitdesc, + $pms->{conf}->{scores}->{$hitdesc}, "BODY: ", + $pms->{conf}->{descriptions}->{$hitdesc} + ); + $pms->{tests_already_hit}->{$hitdesc} = 1; + } + } else { + $output .= $tmpoutput; + # + # Second try, work on a normalized image. + # + $tmpoutput = ""; + $exitval = 0; + { + $pid2 = Mail::SpamAssassin::Util::helper_app_pipe_open(*PID2, + $tmpfpath, 1, "$giffix | $giftopnm -image=$frame 2>&1 | $pnmnorm | $gocr"); + my @response = ; + close PID2; + while (my $v = shift @response) { + $tmpoutput .= $v; + } + $exitval = quit_stale_helper(*PID2, $pid2, $exitval); + } + if (! 
$exitval) { + $output .= "---new-page---"; + $output .= $tmpoutput; + } + + # + # Next step: use gamma correction + # + $tmpoutput = ""; + $exitval = 0; + { + $pid3 = Mail::SpamAssassin::Util::helper_app_pipe_open(*PID3, + $tmpfpath, 1, "$giffix | $giftopnm -image=$frame 2>&1 | $pnmgamma | $gocr"); + my @response = ; + close PID3; + while (my $v = shift @response) { + $tmpoutput .= $v; + } + $exitval = quit_stale_helper(*PID3, $pid3, $exitval); + } + if (! $exitval) { + $output .= "---new-page---"; + $output .= $tmpoutput; + } + + + # + # Last try, invert the picture and normalize it. + # + $tmpoutput = ""; + $exitval = 0; + { + $pid4 = Mail::SpamAssassin::Util::helper_app_pipe_open(*PID4, + $tmpfpath, 1, "$giffix | $giftopnm -image=$frame 2>&1 | $pnminvert | $pnmnorm | $gocr"); + my @response = ; + close PID4; + while (my $v = shift @response) { + $tmpoutput .= $v; + } + $exitval = quit_stale_helper(*PID4, $pid4, $exitval); + } + if (! $exitval) { + $output .= "---new-page---"; + $output .= $tmpoutput; + } + } + } + } + } elsif ($rtype eq "JPEG") { + $imagecount++; + if ($pnmtools_path ne "" && -x $jpegtopnmexe) { + my $tmpoutput = ""; + $exitval = 0; + # + # First try, just scan the normalized pic + # + { + my $pid5 = Mail::SpamAssassin::Util::helper_app_pipe_open(*PID5, + $tmpfpath, 1, "$jpegtopnm 2>&1 | $gocr"); + my @response = ; + close PID5; + while (my $v = shift @response) { + $tmpoutput .= $v; + } + $exitval = quit_stale_helper(*PID5, $pid5, $exitval); + } + if ($exitval != 0) { + # + # Only return a bad value if jpegtopnm failed + # + $tmpoutput = ""; + $exitval = 0; + { + $pid6 = Mail::SpamAssassin::Util::helper_app_pipe_open(*PID6, + $tmpfpath, 1, "$jpegtopnm 2>&1"); + my @response = ; + close PID6; + while (my $v = shift @response) { + $tmpoutput .= $v; + } + $exitval = quit_stale_helper(*PID6, $pid6, $exitval); + } + if ($exitval != 0 && $tmpoutput !~ /End-of-file/) { + dbg("ocrtext: broken jpeg pic found"); + my $hitdesc = "SPAMPIC_BROKEN"; + 
$pms->_handle_hit($hitdesc, + $pms->{conf}->{scores}->{$hitdesc}, "BODY: ", + $pms->{conf}->{descriptions}->{$hitdesc} + ); + $pms->{tests_already_hit}->{$hitdesc} = 1; + } + } else { + $output .= $tmpoutput; + # + # Second try, normalize the pic + # + $tmpoutput = ""; + $exitval = 0; + { + my $pid6 = Mail::SpamAssassin::Util::helper_app_pipe_open(*PID6, + $tmpfpath, 1, "$jpegtopnm 2>&1 | $pnmnorm | $gocr"); + my @response = ; + close PID6; + while (my $v = shift @response) { + $tmpoutput .= $v; + } + $exitval = quit_stale_helper(*PID6, $pid6, $exitval); + } + if (! $exitval) { + $output .= "---new-page---"; + $output .= $tmpoutput; + } + + # + # Third try, limit the colors, disable dither + # and use a grayscale pic only. + # + $tmpoutput = ""; + $exitval = 0; + { + my $pid7 = Mail::SpamAssassin::Util::helper_app_pipe_open(*PID7, + $tmpfpath, 1, "$djpeg -gray -colors 8 -dither none -pnm 2>&1 | $gocr"); + my @response = ; + close PID7; + while (my $v = shift @response) { + $tmpoutput .= $v; + } + $exitval = quit_stale_helper(*PID7, $pid7, $exitval); + } + if (! $exitval) { + $output .= "---new-page---"; + $output .= $tmpoutput; + } + + # + # Forth try, limit the colors to 8 and invert the pic. + # + $tmpoutput = ""; + $exitval = 0; + { + my $pid8 = Mail::SpamAssassin::Util::helper_app_pipe_open(*PID8, + $tmpfpath, 1, "$djpeg -colors 8 -pnm 2>&1 | $pnminvert | $gocr"); + my @response = ; + close PID8; + while (my $v = shift @response) { + $tmpoutput .= $v; + } + $exitval = quit_stale_helper(*PID8, $pid8, $exitval); + } + if (! $exitval) { + $output .= "---new-page---"; + $output .= $tmpoutput; + } + + # + # Last try, invert the picture and normalize it. 
+ # + $tmpoutput = ""; + $exitval = 0; + { + my $pid9 = Mail::SpamAssassin::Util::helper_app_pipe_open(*PID9, + $tmpfpath, 1, "$jpegtopnm 2>&1 | $pnminvert | $pnmnorm | $gocr"); + my @response = ; + close PID9; + while (my $v = shift @response) { + $tmpoutput .= $v; + } + $exitval = quit_stale_helper(*PID9, $pid9, $exitval); + } + if (! $exitval) { + $output .= "---new-page---"; + $output .= $tmpoutput; + } + } + } + } elsif ($rtype eq "PNG") { + $imagecount++; + if ($pnmtools_path ne "" && -x $pngtopnmexe) { + my $tmpoutput = ""; + { + my $pid10 = Mail::SpamAssassin::Util::helper_app_pipe_open(*PID10, + $tmpfpath, 1, "$pngtopnm 2>&1 | $gocr"); + my @response = ; + close PID10; + while (my $v = shift @response) { + $tmpoutput .= $v; + } + $exitval = quit_stale_helper(*PID10, $pid10, $exitval); + } + if ($exitval != 0) { + # + # Only return a bad value if pngtopnm failed + # + $tmpoutput = ""; + $exitval = 0; + { + $pid11 = Mail::SpamAssassin::Util::helper_app_pipe_open(*PID11, + $tmpfpath, 1, "$pngtopnm 2>&1"); + my @response = ; + close PID11; + $exitval = quit_stale_helper(*PID11, $pid11, $exitval); + } + if ($exitval != 0) { + dbg("ocrtext: broken png pic found"); + my $hitdesc = "SPAMPIC_BROKEN"; + $pms->_handle_hit($hitdesc, + $pms->{conf}->{scores}->{$hitdesc}, "BODY: ", + $pms->{conf}->{descriptions}->{$hitdesc} + ); + $pms->{tests_already_hit}->{$hitdesc} = 1; + } + } else { + $output .= $tmpoutput; + # + # Second try, work on a normalized image. + # + $tmpoutput = ""; + $exitval = 0; + { + my $pid11 = Mail::SpamAssassin::Util::helper_app_pipe_open(*PID11, + $tmpfpath, 1, "$giffix | $pngtopnm 2>&1 | $pnmnorm | $gocr"); + my @response = ; + close PID11; + while (my $v = shift @response) { + $tmpoutput .= $v; + } + $exitval = quit_stale_helper(*PID11, $pid11, $exitval); + } + if (! $exitval) { + $output .= "---new-page---"; + $output .= $tmpoutput; + } + + # + # Last try, invert the picture and normalize it. 
+ # + $tmpoutput = ""; + $exitval = 0; + { + my $pid12 = Mail::SpamAssassin::Util::helper_app_pipe_open(*PID12, + $tmpfpath, 1, "$giffix | $giftopnm 2>&1 | pnminvert | $pnmnorm | $gocr"); + my @response = ; + close PID12; + while (my $v = shift @response) { + $tmpoutput .= $v; + } + $exitval = quit_stale_helper(*PID12, $pid12, $exitval); + } + if (! $exitval) { + $output .= "---new-page---"; + $output .= $tmpoutput; + } + } + } + } + + }); + $pms->leave_helper_run_mode(); + + # + # Kill and close any open helpers. + # + quit_stale_helper(*PID0, $pid0, 0); + if ($rtype eq "GIF") { + quit_stale_helper(*PID1, $pid1, 0); + quit_stale_helper(*PID2, $pid2, 0); + quit_stale_helper(*PID3, $pid3, 0); + quit_stale_helper(*PID4, $pid4, 0); + } elsif ($rtype eq "JPEG") { + quit_stale_helper(*PID5, $pid5, 0); + quit_stale_helper(*PID6, $pid6, 0); + quit_stale_helper(*PID7, $pid7, 0); + quit_stale_helper(*PID8, $pid8, 0); + quit_stale_helper(*PID9, $pid9, 0); + } elsif ($rtype eq "PNG") { + quit_stale_helper(*PID10, $pid10, 0); + quit_stale_helper(*PID11, $pid11, 0); + quit_stale_helper(*PID12, $pid12, 0); + } + + my @words = split(/;/,$pms->{main}->{conf}->{ocrtext_words}); + my @pwords = split(/;/,$pms->{main}->{conf}->{ocrtext_pwords}); + + my $cnt = 0; + my $tmpoutput = $output; + $tmpoutput =~ s/---new-page---//g; + $tmpoutput =~ tr/!;|081/iiioal/; + $tmpoutput =~ s/[^a-zA-Z0-9\:]//g; + $tmpoutput = lc $tmpoutput; + # + # Negative words, each of them gives +1 to cnt + # + foreach my $w (@words) { + my $wthreshold; + if ($w =~ /^(.*?)::(0(\.\d+){0,1})/) { + ($w, $wthreshold) = ($1, $2); + } else { + $wthreshold = $threshold; + } + $w =~ s/[^a-zA-Z0-9]//g; + $w = lc $w; + + my $rw = $w; + $rw =~ s/[\:il1¦]/\[il1\:\]/; + $rw =~ s/[a8\@]/\[a8\@\]/; + $rw =~ s/[o0]/\[o0\]/; + $rw =~ s/[rn]/\[rn\]/; + $rw =~ s/[mw]/\[mw\]/; + $rw =~ s/[s5]/\[s5\]/; + if ($tmpoutput =~ /($rw)/) { + dbg("ocrtext: found word \"$w\" as regex match in string \"$1\""); + $cnt++; + } else { + if 
($tmpoutput && $tmpoutput ne "") { + $_ = lc; + my $matched = adistr( $w, $tmpoutput); + if ($matched && $matched ne "" && abs($matched) < $wthreshold ) { + $cnt++; + dbg("ocrtext: found word \"$w\" with fuzz of " . abs($matched)); + } + } + } + } + # + # Positive words, each of them gives +1 to cnt + # + foreach my $w (@pwords) { + my $wthreshold; + if ($w =~ /^(.*?)::(0(\.\d+){0,1})/) { + ($w, $wthreshold) = ($1, $2); + } else { + $wthreshold = $threshold; + } + $w =~ s/[^a-zA-Z0-9]//g; + $w = lc $w; + + my $rw = $w; + $rw =~ s/[\:il1¦]/\[il1\:\]/; + $rw =~ s/[a8\@]/\[a8\@\]/; + $rw =~ s/[o0]/\[o0\]/; + $rw =~ s/[rn]/\[rn\]/; + $rw =~ s/[mw]/\[mw\]/; + $rw =~ s/[s5]/\[s5\]/; + if ($tmpoutput =~ /($rw)/) { + dbg("ocrtext: found word \"$w\" as regex match in string \"$1\""); + $cnt--; + } else { + if ($tmpoutput && $tmpoutput ne "") { + $_ = lc; + my $matched = adistr( $w, $tmpoutput); + if ($matched && $matched ne "" && abs($matched) < $wthreshold ) { + $cnt--; + dbg("ocrtext: found word \"$w\" with fuzz of " . abs($matched)); + } + } + } + } + if ($cnt >= 1) { + dbg("ocrtext: found $cnt words in picture"); + my $hitdesc; + if ($cnt == 1) { + $hitdesc = "SPAMPIC_WORDS_1"; + } elsif ($cnt == 2) { + $hitdesc = "SPAMPIC_WORDS_2"; + } elsif ($cnt == 3) { + $hitdesc = "SPAMPIC_WORDS_3"; + } elsif ($cnt == 4) { + $hitdesc = "SPAMPIC_WORDS_4"; + } elsif ($cnt > 4) { + $hitdesc = "SPAMPIC_WORDS_5"; + } + $pms->_handle_hit($hitdesc, + $pms->{conf}->{scores}->{$hitdesc}, "BODY: ", + $pms->{conf}->{descriptions}->{$hitdesc} + ); + $pms->{tests_already_hit}->{$hitdesc} = 1; + } + + my $cnt_pages = 1; + while ($output =~ /---new-page---/g) { $cnt_pages++ } + if ($output ne "" && $output !~ /^P6/) { + # + # Remove spaces and points. + # + my $tmpoutput = $output; + $output =~ s/[\s\t ]+//g; + $output =~ s/\.//g; + dbg("ocrtext: : found output $output"); + push @ocrtext, $output; + + # + # Try to sort out patterns and equal chars. Numbers + # are currently not counted at all. 
S and s are skipped. + # + $tmpoutput =~ s/---new-page---//; + $tmpoutput =~ s/[^a-z0-9]//; + my @chars = ('a','b','c','d','e','f','g','h','i','j','k','l','m','n','o','p','q','r','t','u','v','w','x','y','z'); + foreach my $char (@chars) { + $tmpoutput =~ s/$char{2,}/$char/ig; + } + my $cnt_alpha = $tmpoutput =~ tr/a-zA-Z//; + my $cnt_digit = $tmpoutput =~ tr/0-9//; + if ($cnt_alpha) { $cnt_alpha = $cnt_alpha / $cnt_pages }; + if ($cnt_digit) { $cnt_digit = $cnt_digit / $cnt_pages }; + + dbg("ocrtext: found $cnt_alpha chars and $cnt_digit digits from ocr output"); + + my $ocrtext_alpha1 = $pms->{main}->{conf}->{ocrtext_alpha1}; + my $ocrtext_alpha2 = $pms->{main}->{conf}->{ocrtext_alpha2}; + my $ocrtext_alpha3 = $pms->{main}->{conf}->{ocrtext_alpha3}; + + if ($cnt_alpha > $ocrtext_alpha1) { + my $hitdesc; + if ($cnt_alpha < $ocrtext_alpha2) { + $hitdesc = "SPAMPIC_ALPHA_1"; + } elsif ($cnt_alpha >= $ocrtext_alpha2 && $cnt_alpha < $ocrtext_alpha3) { + $hitdesc = "SPAMPIC_ALPHA_2"; + } elsif ($cnt_alpha >= $ocrtext_alpha3) { + $hitdesc = "SPAMPIC_ALPHA_3"; + } + $pms->_handle_hit($hitdesc, + $pms->{conf}->{scores}->{$hitdesc}, "BODY: ", + $pms->{conf}->{descriptions}->{$hitdesc} + ); + $pms->{tests_already_hit}->{$hitdesc} = 1; + } + } + } + } + unlink $tmpfpath; + unlink $tmpfpath . 
".cons"; + } + dbg("ocrtext: Imagecount is $imagetcount"); + push @ocrtext, "OCRTEXT: Imagecount is $imagetcount"; + if ($imagetcount > 1) { + my $hitdesc; + if ($imagetcount == 2) { + $hitdesc = "__SPAMPIC_COUNT_2"; + } elsif ($imagetcount == 3) { + $hitdesc = "__SPAMPIC_COUNT_3"; + } elsif ($imagetcount == 4) { + $hitdesc = "__SPAMPIC_COUNT_4"; + } elsif ($imagetcount == 5) { + $hitdesc = "__SPAMPIC_COUNT_5"; + } elsif ($imagetcount == 6) { + $hitdesc = "__SPAMPIC_COUNT_6"; + } elsif ($imagetcount >= 7) { + $hitdesc = "__SPAMPIC_COUNT_7"; + } + $pms->_handle_hit($hitdesc, + $pms->{conf}->{scores}->{$hitdesc}, "BODY: ", + $pms->{conf}->{descriptions}->{$hitdesc} + ); + $pms->{tests_already_hit}->{$hitdesc} = 1; + } + + return 0; +} +
#
# quit_stale_helper($PID, $pid, $exitval)
#
# Kill and close a helper pipe that is still open.
#
#   $PID      filehandle (typeglob) of the helper pipe, e.g. *PID1
#   $pid      process id of the helper; may be undef/0 if none was started
#   $exitval  value to return when there is nothing left to close
#
# Returns $? as set by close() when the handle was still open, otherwise
# the $exitval that was passed in.  A handle that the caller has already
# closed (or an undefined handle) is left alone, so in that case the exit
# status of the earlier close is NOT recovered here.
#
# The handle passed in $PID is the one that gets tested and closed.  The
# previous code used the bareword glob PID instead, which is a different
# handle than the *PID1 .. *PID12 globs the callers pass, so the stale
# helper was never killed or closed.
#
sub quit_stale_helper {
    my ($PID, $pid, $exitval) = @_;

    # fileno() is undefined for a handle that is not (or no longer) open.
    return $exitval unless defined $PID && defined fileno($PID);

    if ($pid) {
        kill('KILL', $pid);
        dbg("ocrtext: killed stale helper [$pid]");
    }

    # Closing a pipe handle waits for the child and stores its status in $?.
    close($PID);
    $exitval = $?;

    # Keep the debug line free of "uninitialized" warnings when no pid is known.
    my $pid_label = defined $pid ? $pid : "";
    dbg("ocrtext: [$pid_label] returned $exitval");

    return $exitval;
}

1;