/[Apache-SVN]/spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/MIMEEval.pm
ViewVC logotype

Contents of /spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/MIMEEval.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 789295 - (show annotations)
Mon Jun 29 11:45:12 2009 UTC (4 months, 3 weeks ago) by jm
File size: 14675 byte(s)
bug 5553: MIME_BASE64_TEXT not properly handling MIME charset rules, causing FPs.  fix by Paul Fisher <pnfisher /at/ berkeley.edu>
1 # <@LICENSE>
2 # Licensed to the Apache Software Foundation (ASF) under one or more
3 # contributor license agreements. See the NOTICE file distributed with
4 # this work for additional information regarding copyright ownership.
5 # The ASF licenses this file to you under the Apache License, Version 2.0
6 # (the "License"); you may not use this file except in compliance with
7 # the License. You may obtain a copy of the License at:
8 #
9 # http://www.apache.org/licenses/LICENSE-2.0
10 #
11 # Unless required by applicable law or agreed to in writing, software
12 # distributed under the License is distributed on an "AS IS" BASIS,
13 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 # See the License for the specific language governing permissions and
15 # limitations under the License.
16 # </@LICENSE>
17
18 package Mail::SpamAssassin::Plugin::MIMEEval;
19
20 use strict;
21 use warnings;
22 use bytes;
23 use re 'taint';
24
25 use Mail::SpamAssassin::Plugin;
26 use Mail::SpamAssassin::Locales;
27 use Mail::SpamAssassin::Constants qw(:sa CHARSETS_LIKELY_TO_FP_AS_CAPS);
28 use Mail::SpamAssassin::Util qw(untaint_var);
29
30 use vars qw(@ISA);
31 @ISA = qw(Mail::SpamAssassin::Plugin);
32
33 # constructor: register the eval rule
34 sub new {
35 my $class = shift;
36 my $mailsaobject = shift;
37
38 # some boilerplate...
39 $class = ref($class) || $class;
40 my $self = $class->SUPER::new($mailsaobject);
41 bless ($self, $class);
42
43 # the important bit!
44 $self->register_eval_rule("check_for_mime");
45 $self->register_eval_rule("check_for_mime_html");
46 $self->register_eval_rule("check_for_mime_html_only");
47 $self->register_eval_rule("check_mime_multipart_ratio");
48 $self->register_eval_rule("check_msg_parse_flags");
49 $self->register_eval_rule("check_for_faraway_charset");
50 $self->register_eval_rule("check_for_uppercase");
51 $self->register_eval_rule("check_ma_non_text");
52 $self->register_eval_rule("check_base64_length");
53
54 return $self;
55 }
56
57 ###########################################################################
58
59 sub are_more_high_bits_set {
60 my ($self, $str) = @_;
61
62 # TODO: I suspect a tr// trick may be faster here
63 my $numhis = () = ($str =~ /[\200-\377]/g);
64 my $numlos = length($str) - $numhis;
65
66 ($numlos <= $numhis && $numhis > 3);
67 }
68
69 sub check_for_faraway_charset {
70 my ($self, $pms, $body) = @_;
71
72 my $type = $pms->get('Content-Type',undef);
73
74 my @locales = Mail::SpamAssassin::Util::get_my_locales($self->{main}->{conf}->{ok_locales});
75
76 return 0 if grep { $_ eq "all" } @locales;
77
78 $type = get_charset_from_ct_line($type) if defined $type;
79
80 if (defined $type &&
81 !Mail::SpamAssassin::Locales::is_charset_ok_for_locales
82 ($type, @locales))
83 {
84 # sanity check. Some charsets (e.g. koi8-r) include the ASCII
85 # 7-bit charset as well, so make sure we actually have a high
86 # number of 8-bit chars in the body text first.
87
88 $body = join("\n", @$body);
89 if ($self->are_more_high_bits_set ($body)) {
90 return 1;
91 }
92 }
93
94 0;
95 }
96
97 sub check_for_mime {
98 my ($self, $pms, undef, $test) = @_;
99
100 $self->_check_attachments($pms) unless exists $pms->{$test};
101 return $pms->{$test};
102 }
103
104 # any text/html MIME part
105 sub check_for_mime_html {
106 my ($self, $pms) = @_;
107
108 my $ctype = $pms->get('Content-Type');
109 return 1 if $ctype =~ m{^text/html}i;
110
111 $self->_check_attachments($pms) unless exists $pms->{mime_body_html_count};
112 return ($pms->{mime_body_html_count} > 0);
113 }
114
115 # HTML without some other type of MIME text part
116 sub check_for_mime_html_only {
117 my ($self, $pms) = @_;
118
119 my $ctype = $pms->get('Content-Type');
120 return 1 if $ctype =~ m{^text/html}i;
121
122 $self->_check_attachments($pms) unless exists $pms->{mime_body_html_count};
123 return ($pms->{mime_body_html_count} > 0 &&
124 $pms->{mime_body_text_count} == 0);
125 }
126
127 sub check_mime_multipart_ratio {
128 my ($self, $pms, undef, $min, $max) = @_;
129
130 $self->_check_attachments($pms) unless exists $pms->{mime_multipart_alternative};
131
132 return ($pms->{mime_multipart_ratio} >= $min &&
133 $pms->{mime_multipart_ratio} < $max);
134 }
135
136 sub _check_mime_header {
137 my ($self, $pms, $ctype, $cte, $cd, $charset, $name) = @_;
138
139 $charset ||= '';
140
141 if ($ctype eq 'text/html') {
142 $pms->{mime_body_html_count}++;
143 }
144 elsif ($ctype =~ m@^text@i) {
145 $pms->{mime_body_text_count}++;
146 }
147
148 if ($cte =~ /base64/) {
149 $pms->{mime_base64_count}++;
150 }
151 elsif ($cte =~ /quoted-printable/) {
152 $pms->{mime_qp_count}++;
153 }
154
155 if ($cd && $cd =~ /attachment/) {
156 $pms->{mime_attachment}++;
157 }
158
159 if ($ctype =~ /^text/ &&
160 $cte =~ /base64/ &&
161 (!$charset || $charset =~ /(?:us-ascii|ansi_x3\.4-1968|iso-ir-6|ansi_x3\.4-1986|iso_646\.irv:1991|ascii|iso646-us|us|ibm367|cp367|csascii)/) &&
162 !($cd && $cd =~ /^(?:attachment|inline)/))
163 {
164 $pms->{mime_base64_encoded_text} = 1;
165 }
166
167 if ($charset =~ /iso-\S+-\S+\b/i &&
168 $charset !~ /iso-(?:8859-\d{1,2}|2022-(?:jp|kr))\b/)
169 {
170 $pms->{mime_bad_iso_charset} = 1;
171 }
172
173 # MIME_BASE64_LATIN: now a zero-hitter
174 # if (!$name &&
175 # $cte =~ /base64/ &&
176 # $charset =~ /\b(?:us-ascii|iso-8859-(?:[12349]|1[0345])|windows-(?:125[0247]))\b/)
177 # {
178 # $pms->{mime_base64_latin} = 1;
179 # }
180
181 # MIME_QP_NO_CHARSET: now a zero-hitter
182 # if ($cte =~ /quoted-printable/ && $cd =~ /inline/ && !$charset) {
183 # $pms->{mime_qp_inline_no_charset} = 1;
184 # }
185
186 # MIME_HTML_NO_CHARSET: now a zero-hitter
187 # if ($ctype eq 'text/html' &&
188 # !(defined($charset) && $charset) &&
189 # !($cd && $cd =~ /^(?:attachment|inline)/))
190 # {
191 # $pms->{mime_html_no_charset} = 1;
192 # }
193
194 if ($charset =~ /[a-z]/i) {
195 if (defined $pms->{mime_html_charsets}) {
196 $pms->{mime_html_charsets} .= " ".$charset;
197 } else {
198 $pms->{mime_html_charsets} = $charset;
199 }
200
201 if (! $pms->{mime_faraway_charset}) {
202 my @l = Mail::SpamAssassin::Util::get_my_locales($self->{main}->{conf}->{ok_locales});
203
204 if (!(grep { $_ eq "all" } @l) &&
205 !Mail::SpamAssassin::Locales::is_charset_ok_for_locales($charset, @l))
206 {
207 $pms->{mime_faraway_charset} = 1;
208 }
209 }
210 }
211 }
212
213 sub _check_attachments {
214 my ($self, $pms) = @_;
215
216 # MIME status
217 my $where = -1; # -1 = start, 0 = nowhere, 1 = header, 2 = body
218 my $qp_bytes = 0; # total bytes in QP regions
219 my $qp_count = 0; # QP-encoded bytes in QP regions
220 my @part_bytes; # MIME part total bytes
221 my @part_type; # MIME part types
222
223 # MIME header information
224 my $part = -1; # MIME part index
225
226 # indicate the scan has taken place
227 $pms->{mime_checked_attachments} = 1;
228
229 # results
230 $pms->{mime_base64_blanks} = 0;
231 $pms->{mime_base64_count} = 0;
232 $pms->{mime_base64_encoded_text} = 0;
233 # $pms->{mime_base64_illegal} = 0;
234 # $pms->{mime_base64_latin} = 0;
235 $pms->{mime_body_html_count} = 0;
236 $pms->{mime_body_text_count} = 0;
237 $pms->{mime_faraway_charset} = 0;
238 # $pms->{mime_html_no_charset} = 0;
239 $pms->{mime_missing_boundary} = 0;
240 $pms->{mime_multipart_alternative} = 0;
241 $pms->{mime_multipart_ratio} = 1.0;
242 $pms->{mime_qp_count} = 0;
243 # $pms->{mime_qp_illegal} = 0;
244 # $pms->{mime_qp_inline_no_charset} = 0;
245 $pms->{mime_qp_long_line} = 0;
246 $pms->{mime_qp_ratio} = 0;
247
248 # Get all parts ...
249 foreach my $p ($pms->{msg}->find_parts(qr/./)) {
250 # message headers
251 my ($ctype, $boundary, $charset, $name) = Mail::SpamAssassin::Util::parse_content_type($p->get_header("content-type"));
252
253 if ($ctype eq 'multipart/alternative') {
254 $pms->{mime_multipart_alternative} = 1;
255 }
256
257 my $cte = $p->get_header('Content-Transfer-Encoding') || '';
258 chomp($cte = defined($cte) ? lc $cte : "");
259
260 my $cd = $p->get_header('Content-Disposition') || '';
261 chomp($cd = defined($cd) ? lc $cd : "");
262
263 $charset = lc $charset if ($charset);
264 $name = lc $name if ($name);
265
266 $self->_check_mime_header($pms, $ctype, $cte, $cd, $charset, $name);
267
268 # If we're not in a leaf node in the tree, there will be no raw
269 # section, so skip it.
270 if (! $p->is_leaf()) {
271 next;
272 }
273
274 $part++;
275 $part_type[$part] = $ctype;
276 $part_bytes[$part] = 0 if $cd !~ /attachment/;
277
278 my $previous = '';
279 foreach (@{$p->raw()}) {
280 if ($cte =~ /base64/i) {
281 if ($previous =~ /^\s*$/ && /^\s*$/) {
282 $pms->{mime_base64_blanks} = 1;
283 }
284 # MIME_BASE64_ILLEGAL: now a zero-hitter
285 # if (m@[^A-Za-z0-9+/=\n]@ || /=[^=\s]/) {
286 # $pms->{mime_base64_illegal} = 1;
287 # }
288 }
289
290 # if ($pms->{mime_html_no_charset} && $ctype eq 'text/html' && defined $charset) {
291 # $pms->{mime_html_no_charset} = 0;
292 # }
293 if ($pms->{mime_multipart_alternative} && $cd !~ /attachment/ &&
294 ($ctype eq 'text/plain' || $ctype eq 'text/html')) {
295 $part_bytes[$part] += length;
296 }
297
298 if ($where != 1 && $cte eq "quoted-printable" && ! /^SPAM: /) {
299 if (length > 77) {
300 $pms->{mime_qp_long_line} = 1;
301 }
302 $qp_bytes += length;
303
304 # MIME_QP_DEFICIENT: zero-hitter now
305
306 # check for illegal substrings (RFC 2045), hexadecimal values 7F-FF and
307 # control characters other than TAB, or CR and LF as parts of CRLF pairs
308 # if (!$pms->{mime_qp_illegal} && /[\x00-\x08\x0b\x0c\x0e-\x1f\x7f-\xff]/)
309 # {
310 # $pms->{mime_qp_illegal} = 1;
311 # }
312
313 # count excessive QP bytes
314 if (index($_, '=') != -1) {
315 # whoever wrote this next line is an evil hacker -- jm
316 my $qp = () = m/=(?:09|3[0-9ABCEF]|[2456][0-9A-F]|7[0-9A-E])/g;
317 if ($qp) {
318 $qp_count += $qp;
319 # tabs and spaces at end of encoded line are okay. Also, multiple
320 # whitespace at the end of a line are OK, like ">=20=20=20=20=20=20".
321 my ($trailing) = m/((?:=09|=20)+)\s*$/g;
322 if ($trailing) {
323 $qp_count -= (length($trailing) / 3);
324 }
325 }
326 }
327 }
328 $previous = $_;
329 }
330 }
331
332 if ($qp_bytes) {
333 $pms->{mime_qp_ratio} = $qp_count / $qp_bytes;
334 }
335
336 if ($pms->{mime_multipart_alternative}) {
337 my $text;
338 my $html;
339 # bug 4207: we want the size of the last parts
340 for (my $i = $part; $i >= 0; $i--) {
341 next if !defined $part_bytes[$i];
342 if (!defined($html) && $part_type[$i] eq 'text/html') {
343 $html = $part_bytes[$i];
344 }
345 elsif (!defined($text) && $part_type[$i] eq 'text/plain') {
346 $text = $part_bytes[$i];
347 }
348 last if (defined($html) && defined($text));
349 }
350 if (defined($text) && defined($html) && $html > 0) {
351 $pms->{mime_multipart_ratio} = ($text / $html);
352 }
353 }
354
355 # Look to see if any multipart boundaries are not "balanced"
356 foreach my $val (values %{$pms->{msg}->{mime_boundary_state}}) {
357 if ($val != 0) {
358 $pms->{mime_missing_boundary} = 1;
359 last;
360 }
361 }
362 }
363
364 sub check_msg_parse_flags {
365 my($self, $pms, $type, $type2) = @_;
366 $type = $type2 if ref($type);
367 return defined $pms->{msg}->{$type};
368 }
369
370 sub check_for_uppercase {
371 my ($self, $pms, $body, $min, $max) = @_;
372 local ($_);
373
374 if (exists $pms->{uppercase}) {
375 return ($pms->{uppercase} > $min && $pms->{uppercase} <= $max);
376 }
377
378 if ($self->body_charset_is_likely_to_fp($pms)) {
379 $pms->{uppercase} = 0; return 0;
380 }
381
382 # Dec 20 2002 jm: trade off some speed for low memory footprint, by
383 # iterating over the array computing sums, instead of joining the
384 # array into a giant string and working from that.
385
386 my $len = 0;
387 my $lower = 0;
388 my $upper = 0;
389 foreach (@{$body}) {
390 # examine lines in the body that have an intermediate space
391 next unless /\S\s+\S/;
392 # strip out lingering base64 (currently possible for forwarded messages)
393 next if /^(?:[A-Za-z0-9+\/=]{60,76} ){2}/;
394
395 my $line = $_; # copy so we don't muck up the original
396
397 # remove shift-JIS charset codes
398 $line =~ s/\x1b\$B.*\x1b\(B//gs;
399
400 $len += length($line);
401
402 # count numerals as lower case, otherwise 'date|mail' is spam
403 $lower += ($line =~ tr/a-z0-9//d);
404 $upper += ($line =~ tr/A-Z//);
405 }
406
407 # report only on mails above a minimum size; otherwise one
408 # or two acronyms can throw it off
409 if ($len < 200) {
410 $pms->{uppercase} = 0;
411 return 0;
412 }
413 if (($upper + $lower) == 0) {
414 $pms->{uppercase} = 0;
415 } else {
416 $pms->{uppercase} = ($upper / ($upper + $lower)) * 100;
417 }
418
419 return ($pms->{uppercase} > $min && $pms->{uppercase} <= $max);
420 }
421
422 sub body_charset_is_likely_to_fp {
423 my ($self, $pms) = @_;
424
425 # check for charsets where this test will FP -- iso-2022-jp, gb2312,
426 # koi8-r etc.
427 #
428 $self->_check_attachments($pms) unless exists $pms->{mime_checked_attachments};
429 my @charsets;
430 my $type = $pms->get('Content-Type',undef);
431 $type = get_charset_from_ct_line($type) if defined $type;
432 push (@charsets, $type) if defined $type;
433 if (defined $pms->{mime_html_charsets}) {
434 push (@charsets, split(' ', $pms->{mime_html_charsets}));
435 }
436
437 my $CHARSETS_LIKELY_TO_FP_AS_CAPS = CHARSETS_LIKELY_TO_FP_AS_CAPS;
438 foreach my $charset (@charsets) {
439 if ($charset =~ /^${CHARSETS_LIKELY_TO_FP_AS_CAPS}$/) {
440 return 1;
441 }
442 }
443 return 0;
444 }
445
446 sub get_charset_from_ct_line {
447 my $type = shift;
448 if (!defined $type) { return undef; }
449 if ($type =~ /charset="([^"]+)"/i) { return $1; }
450 if ($type =~ /charset='([^']+)'/i) { return $1; }
451 if ($type =~ /charset=(\S+)/i) { return $1; }
452 return undef;
453 }
454
455 # came up on the users@ list, look for multipart/alternative parts which
456 # include non-text parts -- skip certain types which occur normally in ham
457 sub check_ma_non_text {
458 my($self, $pms) = @_;
459
460 foreach my $map ($pms->{msg}->find_parts(qr@^multipart/alternative$@i)) {
461 foreach my $p ($map->find_parts(qr/./, 1, 0)) {
462 next if (lc $p->{'type'} eq 'multipart/related');
463 next if (lc $p->{'type'} eq 'application/rtf');
464 next if ($p->{'type'} =~ m@^text/@i);
465 return 1;
466 }
467 }
468
469 return 0;
470 }
471
472 sub check_base64_length {
473 my $self = shift;
474 my $pms = shift;
475 shift; # body array, unnecessary
476 my $min = shift;
477 my $max = shift;
478
479 if (!defined $pms->{base64_length}) {
480 $pms->{base64_length} = $self->_check_base64_length($pms->{msg});
481 }
482
483 return 0 if (defined $max && $pms->{base64_length} > $max);
484 return $pms->{base64_length} >= $min;
485 }
486
487 sub _check_base64_length {
488 my $self = shift;
489 my $msg = shift;
490
491 my $result = 0;
492
493 foreach my $p ($msg->find_parts(qr@.@, 1)) {
494 my $ctype=
495 Mail::SpamAssassin::Util::parse_content_type($p->get_header('content-type'));
496
497 # FPs from Google Calendar invites, etc.
498 # perhaps just limit to test, and image?
499 next if ($ctype eq 'application/ics');
500
501 my $cte = lc $p->get_header('content-transfer-encoding') || '';
502 next if ($cte !~ /^base64$/);
503 foreach my $l ( @{$p->raw()} ) {
504 my $len = length $l;
505 $result = $len if ($len > $result);
506 }
507 }
508
509 return $result;
510 }
511
512 1;

apache@apache.org
ViewVC Help
Powered by ViewVC 1.1.2