You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spamassassin.apache.org by fe...@apache.org on 2004/11/27 00:32:04 UTC
svn commit: r106669 - /spamassassin/trunk/lib/Mail/SpamAssassin/Util/RegistrarBoundaries.pm
Author: felicity
Date: Fri Nov 26 15:32:02 2004
New Revision: 106669
URL: http://svn.apache.org/viewcvs?view=rev&rev=106669
Log:
bug 3992: speed increase for RegistrarBoundaries::split_domain(). Instead of doing everything with regexps, use hashes and slightly alter how we find the domain. This cuts out all the backtracking involved with the RE version; testing shows a ~20% speed increase.
Modified:
spamassassin/trunk/lib/Mail/SpamAssassin/Util/RegistrarBoundaries.pm
Modified: spamassassin/trunk/lib/Mail/SpamAssassin/Util/RegistrarBoundaries.pm
Url: http://svn.apache.org/viewcvs/spamassassin/trunk/lib/Mail/SpamAssassin/Util/RegistrarBoundaries.pm?view=diff&rev=106669&p1=spamassassin/trunk/lib/Mail/SpamAssassin/Util/RegistrarBoundaries.pm&r1=106668&p2=spamassassin/trunk/lib/Mail/SpamAssassin/Util/RegistrarBoundaries.pm&r2=106669
==============================================================================
--- spamassassin/trunk/lib/Mail/SpamAssassin/Util/RegistrarBoundaries.pm (original)
+++ spamassassin/trunk/lib/Mail/SpamAssassin/Util/RegistrarBoundaries.pm Fri Nov 26 15:32:02 2004
@@ -23,953 +23,161 @@
use bytes;
use vars qw (
- @ISA $TWO_LEVEL_DOMAINS $THREE_LEVEL_DOMAINS $US_STATES $FOUR_LEVEL_DOMAINS
- $VALID_TLDS
+ @ISA %TWO_LEVEL_DOMAINS %US_STATES %VALID_TLDS
);
# The list of currently-valid TLDs for the DNS system.
#
-$VALID_TLDS = qr{ (?:
# http://www.iana.org/cctld/cctld-whois.htm
- ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|az|ax|ba|bb|bd|be|bf|bg|bh|bi|
- bj|bm|bn|bo|br|bs|bt|bv|bw|by|bz|ca|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|co|cr|cs|cu|
- cv|cx|cy|cz|de|dj|dk|dm|do|dz|ec|ee|eg|eh|er|es|et|fi|fj|fk|fm|fo|fr|ga|gb|gd|
- ge|gf|gg|gh|gi|gl|gm|gn|gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|hu|id|ie|il|im|
- in|io|iq|ir|is|it|je|jm|jo|jp|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|
- lr|ls|lt|lu|lv|ly|ma|mc|md|mg|mh|mk|ml|mm|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|
- mz|na|nc|ne|nf|ng|ni|nl|no|np|nr|nu|nz|om|pa|pe|pf|pg|ph|pk|pl|pm|pn|pr|ps|pt|
- pw|py|qa|re|ro|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj|sk|sl|sm|sn|so|sr|st|sv|sy|sz|
- tc|td|tf|tg|th|tj|tk|tl|tm|tn|to|tp|tr|tt|tv|tw|tz|ua|ug|uk|um|us|uy|uz|va|vc|
- ve|vg|vi|vn|vu|wf|ws|ye|yt|yu|za|zm|zw|
-
- # Extra from http://www.iana.org/root-whois/
- su|
-
+ # "su" Extra from http://www.iana.org/root-whois/
# http://www.iana.org/gtld/gtld.htm
- aero| biz| com| coop| info| museum| name| net| org| pro| gov| edu| mil| int|
-
# http://www.iana.org/arpa-dom/
- arpa|
-
- # just in case... futureproofing
+ # "eu" just in case, for the future
+foreach (qw/
+ ac ad ae af ag ai al am an ao aq ar as at au aw az ax ba bb bd be bf bg bh bi
+ bj bm bn bo br bs bt bv bw by bz ca cc cd cf cg ch ci ck cl cm cn co cr cs cu
+ cv cx cy cz de dj dk dm do dz ec ee eg eh er es et fi fj fk fm fo fr ga gb gd
+ ge gf gg gh gi gl gm gn gp gq gr gs gt gu gw gy hk hm hn hr ht hu id ie il im
+ in io iq ir is it je jm jo jp ke kg kh ki km kn kp kr kw ky kz la lb lc li lk
+ lr ls lt lu lv ly ma mc md mg mh mk ml mm mn mo mp mq mr ms mt mu mv mw mx my
+ mz na nc ne nf ng ni nl no np nr nu nz om pa pe pf pg ph pk pl pm pn pr ps pt
+ pw py qa re ro ru rw sa sb sc sd se sg sh si sj sk sl sm sn so sr st sv sy sz
+ tc td tf tg th tj tk tl tm tn to tp tr tt tv tw tz ua ug uk um us uy uz va vc
+ ve vg vi vn vu wf ws ye yt yu za zm zw
+ su
+ aero biz com coop info museum name net org pro gov edu mil int
+ arpa
eu
-
- )
-}ix;
-
-# This is required because the .us domain is nuts. See $THREE_LEVEL_DOMAINS
-# and $FOUR_LEVEL_DOMAINS below.
-#
-$US_STATES = qr{ (?:
- ak|al|ar|az|ca|co|ct|dc|de|fl|ga|gu|hi|ia|id|il|in|ks|ky|la|ma|md|me|mi|
- mn|mo|ms|mt|nc|nd|ne|nh|nj|nm|nv|ny|oh|ok|or|pa|pr|ri|sc|sd|tn|tx|ut|va|vi|
- vt|wa|wi|wv|wy )
-}ix;
-
-$FOUR_LEVEL_DOMAINS = qr( (?:
- # http://www.neustar.us/policies/docs/rfc_1480.txt
- # "Fire-Dept.CI.Los-Angeles.CA.US"
- # "<school-name>.PVT.K12.<state>.US"
-
- pvt\.k12\.${US_STATES}\.us|
- c[io]\.[^\.]+\.${US_STATES}\.us
-)
-)ix;
-
-$THREE_LEVEL_DOMAINS = qr( (?:
- demon\.co\.uk |
- esc\.edu\.ar |
-
- # http://www.neustar.us/policies/docs/rfc_1480.txt
- [^\.]+\.${US_STATES}\.us )
-)ix;
-
-$TWO_LEVEL_DOMAINS = qr{ (?:
+ /) {
+ $VALID_TLDS{$_} = 1;
+}
# http://www.neustar.us/policies/docs/rfc_1480.txt
-
- fed\.us |
- dni\.us |
-
# data from http://spamcheck.freeapp.net/two-level-tlds , in turn from
# http://www.bestregistrar.com/help/ccTLD.htm
# http://www.hkdnr.net.hk/instructions/new_domain.html
+foreach(qw/
+ fed.us dni.us
+ com.ac edu.ac gov.ac mil.ac net.ac org.ac
+ ac.ae co.ae com.ae gov.ae net.ae org.ae pro.ae sch.ae
+ com.ai edu.ai gov.ai org.ai
+ com.ar edu.ar gov.ar int.ar mil.ar net.ar org.ar uba.ar
+ e164.arpa
+ ac.at co.at gv.at or.at priv.at
+ asn.au com.au conf.au csiro.au edu.au gov.au id.au info.au net.au org.au otc.au oz.au telememo.au
+ com.az net.az org.az
+ com.bb net.bb org.bb
+ ac.be belgie.be dns.be fgov.be
+ com.bh edu.bh gov.bh net.bh org.bh
+ com.bm edu.bm gov.bm net.bm org.bm
+ art.br com.br etc.br g12.br gov.br ind.br inf.br mil.br net.br org.br psi.br rec.br sp.br tmp.br
+ com.bs net.bs org.bs
+ ab.ca bc.ca mb.ca nb.ca nf.ca nl.ca ns.ca nt.ca nu.ca on.ca pe.ca qc.ca sk.ca yk.ca
+ co.ck edu.ck gov.ck net.ck org.ck
+ ac.cn ah.cn bj.cn com.cn cq.cn edu.cn gd.cn gov.cn gs.cn gx.cn gz.cn hb.cn he.cn hi.cn hk.cn hl.cn hn.cn jl.cn js.cn ln.cn mo.cn net.cn nm.cn nx.cn org.cn qh.cn sc.cn sh.cn sn.cn sx.cn tj.cn tw.cn xj.cn xz.cn yn.cn zj.cn
+ arts.co com.co edu.co firm.co gov.co info.co int.co mil.co nom.co org.co rec.co store.co web.co
+ lkd.co.im plc.co.im
+ au.com br.com cn.com de.com eu.com gb.com hu.com no.com qc.com ru.com sa.com se.com uk.com us.com uy.com za.com
+ ac.cr co.cr ed.cr fi.cr go.cr or.cr sa.cr
+ com.cu net.cu org.cu
+ ac.cy com.cy gov.cy net.cy org.cy
+ co.dk
+ art.do com.do edu.do gov.do mil.do net.do org.do web.do
+ art.dz ass.dz com.dz edu.dz gov.dz net.dz org.dz pol.dz
+ com.ec edu.ec fin.ec gov.ec k12.ec med.ec mil.ec net.ec org.ec
+ com.eg edu.eg eun.eg gov.eg net.eg org.eg sci.eg
+ biz.et com.et edu.et gov.et info.et name.et net.et org.et
+ ac.fj com.fj gov.fj id.fj org.fj school.fj
+ ac.fk com.fk gov.fk net.fk nom.fk org.fk
+ aeroport.fr assedic.fr asso.fr avocat.fr avoues.fr barreau.fr cci.fr chambagri.fr chirurgiens-dentistes.fr com.fr experts-comptables.fr geometre-expert.fr gouv.fr greta.fr huissier-justice.fr medecin.fr nom.fr notaires.fr pharmacien.fr port.fr prd.fr presse.fr tm.fr veterinaire.fr
+ com.ge edu.ge gov.ge mil.ge net.ge org.ge pvt.ge
+ ac.gg alderney.gg co.gg gov.gg guernsey.gg ind.gg ltd.gg net.gg org.gg sark.gg sch.gg
+ com.gu edu.gu gov.gu mil.gu net.gu org.gu
+ com.hk edu.hk gov.hk idv.hk net.hk org.hk
+ 2000.hu agrar.hu bolt.hu casino.hu city.hu co.hu erotica.hu erotika.hu film.hu forum.hu games.hu hotel.hu info.hu ingatlan.hu jogasz.hu konyvelo.hu lakas.hu media.hu news.hu org.hu priv.hu reklam.hu sex.hu shop.hu sport.hu suli.hu szex.hu tm.hu tozsde.hu utazas.hu video.hu
+ ac.id co.id go.id mil.id net.id or.id
+ ac.il co.il gov.il idf.il k12.il muni.il net.il org.il
+ ac.im co.im gov.im net.im nic.im org.im
+ ac.in co.in ernet.in firm.in gen.in gov.in ind.in mil.in net.in nic.in org.in res.in
+ ac.je co.je gov.je ind.je jersey.je ltd.je net.je org.je sch.je
+ com.jo edu.jo gov.jo mil.jo net.jo org.jo
+ ac.jp ad.jp aichi.jp akita.jp aomori.jp chiba.jp co.jp ed.jp ehime.jp fukui.jp fukuoka.jp fukushima.jp gifu.jp go.jp gov.jp gr.jp gunma.jp hiroshima.jp hokkaido.jp hyogo.jp ibaraki.jp ishikawa.jp iwate.jp kagawa.jp kagoshima.jp kanagawa.jp kanazawa.jp kawasaki.jp kitakyushu.jp kobe.jp kochi.jp kumamoto.jp kyoto.jp lg.jp matsuyama.jp mie.jp miyagi.jp miyazaki.jp nagano.jp nagasaki.jp nagoya.jp nara.jp ne.jp net.jp niigata.jp oita.jp okayama.jp okinawa.jp org.jp or.jp osaka.jp saga.jp saitama.jp sapporo.jp sendai.jp shiga.jp shimane.jp shizuoka.jp takamatsu.jp tochigi.jp tokushima.jp tokyo.jp tottori.jp toyama.jp utsunomiya.jp wakayama.jp yamagata.jp yamaguchi.jp yamanashi.jp yokohama.jp
+ com.kh edu.kh gov.kh mil.kh net.kh org.kh per.kh
+ ac.kr co.kr go.kr kyonggi.kr ne.kr or.kr pe.kr re.kr seoul.kr
+ com.kw edu.kw gov.kw net.kw org.kw
+ com.la net.la org.la
+ com.lb edu.lb gov.lb mil.lb net.lb org.lb
+ com.lc edu.lc gov.lc net.lc org.lc
+ asn.lv com.lv conf.lv edu.lv gov.lv id.lv mil.lv net.lv org.lv
+ com.ly net.ly org.ly
+ ac.ma co.ma net.ma org.ma press.ma
+ com.mk
+ com.mm edu.mm gov.mm net.mm org.mm
+ com.mo edu.mo gov.mo net.mo org.mo
+ com.mt edu.mt net.mt org.mt tm.mt uu.mt
+ com.mx net.mx org.mx
+ com.my edu.my gov.my net.my org.my
+ alt.na com.na cul.na edu.na net.na org.na telecom.na unam.na
+ com.nc net.nc org.nc
+ de.net gb.net uk.net
+ ac.ng com.ng edu.ng gov.ng net.ng org.ng sch.ng
+ com.ni edu.ni gob.ni net.ni nom.ni org.ni
+ tel.no
+ com.np edu.np gov.np net.np org.np
+ fax.nr mobile.nr mobil.nr mob.nr tel.nr tlf.nr
+ ac.nz co.nz cri.nz geek.nz gen.nz govt.nz iwi.nz maori.nz mil.nz net.nz org.nz school.nz
+ ac.om biz.om com.om co.om edu.om gov.om med.om mod.om museum.om net.om org.om pro.om
+ dk.org eu.org
+ ac.pa com.pa edu.pa gob.pa net.pa org.pa sld.pa
+ com.pe edu.pe gob.pe mil.pe net.pe nom.pe org.pe
+ ac.pg com.pg net.pg
+ com.ph mil.ph net.ph ngo.ph org.ph
+ biz.pk com.pk edu.pk fam.pk gob.pk gok.pk gon.pk gop.pk gos.pk gov.pk net.pk org.pk web.pk
+ agro.pl aid.pl atm.pl auto.pl biz.pl com.pl edu.pl gmina.pl gsm.pl info.pl mail.pl media.pl miasta.pl mil.pl net.pl nieruchomosci.pl nom.pl org.pl pc.pl powiat.pl priv.pl realestate.pl rel.pl sex.pl shop.pl sklep.pl sos.pl szkola.pl targi.pl tm.pl tourism.pl travel.pl turystyka.pl
+ edu.ps gov.ps plo.ps sec.ps
+ com.py edu.py net.py org.py
+ com.qa edu.qa gov.qa net.qa org.qa
+ asso.re com.re nom.re
+ com.ru net.ru org.ru pp.ru
+ com.sa edu.sa gov.sa med.sa net.sa org.sa pub.sa sch.sa
+ com.sb edu.sb gov.sb net.sb org.sb
+ com.sd edu.sd gov.sd med.sd net.sd org.sd sch.sd
+ com.sg edu.sg gov.sg net.sg org.sg per.sg
+ com.sh edu.sh gov.sh mil.sh net.sh org.sh
+ com.sv edu.sv gob.sv org.sv red.sv
+ com.sy gov.sy net.sy org.sy
+ ac.th co.th go.th net.th or.th
+ com.tn edunet.tn ens.tn fin.tn gov.tn ind.tn info.tn intl.tn nat.tn net.tn org.tn rnrt.tn rns.tn rnu.tn tourism.tn
+ bbs.tr com.tr edu.tr gen.tr gov.tr k12.tr mil.tr net.tr org.tr
+ at.tt au.tt be.tt biz.tt ca.tt com.tt co.tt de.tt dk.tt edu.tt es.tt eu.tt fr.tt gov.tt info.tt it.tt name.tt net.tt nic.tt org.tt pro.tt se.tt uk.tt us.tt
+ co.tv
+ com.tw edu.tw gove.tw idv.tw net.tw org.tw
+ com.ua edu.ua gov.ua net.ua org.ua
+ ac.ug co.ug go.ug or.ug
+ ac.uk co.uk edu.uk gov.uk ltd.uk me.uk mod.uk net.uk nhs.uk nic.uk org.uk plc.uk police.uk sch.uk
+ com.uy edu.uy gub.uy mil.uy net.uy org.uy
+ arts.ve bib.ve com.ve co.ve edu.ve firm.ve gov.ve info.ve int.ve mil.ve net.ve nom.ve org.ve rec.ve store.ve tec.ve web.ve
+ co.vi net.vi org.vi
+ ac.vn biz.vn com.vn edu.vn gov.vn health.vn info.vn int.vn name.vn net.vn org.vn pro.vn ch.vu com.vu de.vu edu.vu fr.vu net.vu org.vu
+ com.ws edu.ws gov.ws net.ws org.ws
+ com.ye edu.ye gov.ye mil.ye net.ye org.ye
+ ac.yu co.yu edu.yu org.yu
+ ac.za alt.za bourse.za city.za co.za edu.za gov.za law.za mil.za net.za ngo.za nom.za org.za school.za tm.za web.za
+ ac.zw co.zw gov.zw org.zw
+ /) {
+ $TWO_LEVEL_DOMAINS{$_} = 1;
+}
- com\.ac |
- edu\.ac |
- gov\.ac |
- mil\.ac |
- net\.ac |
- org\.ac |
- ac\.ae |
- co\.ae |
- com\.ae |
- gov\.ae |
- net\.ae |
- org\.ae |
- pro\.ae |
- sch\.ae |
- com\.ai |
- edu\.ai |
- gov\.ai |
- org\.ai |
- com\.ar |
- edu\.ar |
- gov\.ar |
- int\.ar |
- mil\.ar |
- net\.ar |
- org\.ar |
- uba\.ar |
- e164\.arpa |
- ac\.at |
- co\.at |
- gv\.at |
- or\.at |
- priv\.at |
- asn\.au |
- com\.au |
- conf\.au |
- csiro\.au |
- edu\.au |
- gov\.au |
- id\.au |
- info\.au |
- net\.au |
- org\.au |
- otc\.au |
- oz\.au |
- telememo\.au |
- com\.az |
- net\.az |
- org\.az |
- com\.bb |
- net\.bb |
- org\.bb |
- ac\.be |
- belgie\.be |
- dns\.be |
- fgov\.be |
- com\.bh |
- edu\.bh |
- gov\.bh |
- net\.bh |
- org\.bh |
- com\.bm |
- edu\.bm |
- gov\.bm |
- net\.bm |
- org\.bm |
- art\.br |
- com\.br |
- etc\.br |
- g12\.br |
- gov\.br |
- ind\.br |
- inf\.br |
- mil\.br |
- net\.br |
- org\.br |
- psi\.br |
- rec\.br |
- sp\.br |
- tmp\.br |
- com\.bs |
- net\.bs |
- org\.bs |
- ab\.ca |
- bc\.ca |
- mb\.ca |
- nb\.ca |
- nf\.ca |
- nl\.ca |
- ns\.ca |
- nt\.ca |
- nu\.ca |
- on\.ca |
- pe\.ca |
- qc\.ca |
- sk\.ca |
- yk\.ca |
- co\.ck |
- edu\.ck |
- gov\.ck |
- net\.ck |
- org\.ck |
- ac\.cn |
- ah\.cn |
- bj\.cn |
- com\.cn |
- cq\.cn |
- edu\.cn |
- gd\.cn |
- gov\.cn |
- gs\.cn |
- gx\.cn |
- gz\.cn |
- hb\.cn |
- he\.cn |
- hi\.cn |
- hk\.cn |
- hl\.cn |
- hn\.cn |
- jl\.cn |
- js\.cn |
- ln\.cn |
- mo\.cn |
- net\.cn |
- nm\.cn |
- nx\.cn |
- org\.cn |
- qh\.cn |
- sc\.cn |
- sh\.cn |
- sn\.cn |
- sx\.cn |
- tj\.cn |
- tw\.cn |
- xj\.cn |
- xz\.cn |
- yn\.cn |
- zj\.cn |
- arts\.co |
- com\.co |
- edu\.co |
- firm\.co |
- gov\.co |
- info\.co |
- int\.co |
- mil\.co |
- nom\.co |
- org\.co |
- rec\.co |
- store\.co |
- web\.co |
- lkd\.co\.im |
- plc\.co\.im |
- au\.com |
- br\.com |
- cn\.com |
- de\.com |
- eu\.com |
- gb\.com |
- hu\.com |
- no\.com |
- qc\.com |
- ru\.com |
- sa\.com |
- se\.com |
- uk\.com |
- us\.com |
- uy\.com |
- za\.com |
- ac\.cr |
- co\.cr |
- ed\.cr |
- fi\.cr |
- go\.cr |
- or\.cr |
- sa\.cr |
- com\.cu |
- net\.cu |
- org\.cu |
- ac\.cy |
- com\.cy |
- gov\.cy |
- net\.cy |
- org\.cy |
- co\.dk |
- art\.do |
- com\.do |
- edu\.do |
- gov\.do |
- mil\.do |
- net\.do |
- org\.do |
- web\.do |
- art\.dz |
- ass\.dz |
- com\.dz |
- edu\.dz |
- gov\.dz |
- net\.dz |
- org\.dz |
- pol\.dz |
- com\.ec |
- edu\.ec |
- fin\.ec |
- gov\.ec |
- k12\.ec |
- med\.ec |
- mil\.ec |
- net\.ec |
- org\.ec |
- com\.eg |
- edu\.eg |
- eun\.eg |
- gov\.eg |
- net\.eg |
- org\.eg |
- sci\.eg |
- biz\.et |
- com\.et |
- edu\.et |
- gov\.et |
- info\.et |
- name\.et |
- net\.et |
- org\.et |
- ac\.fj |
- com\.fj |
- gov\.fj |
- id\.fj |
- org\.fj |
- school\.fj |
- ac\.fk |
- com\.fk |
- gov\.fk |
- net\.fk |
- nom\.fk |
- org\.fk |
- aeroport\.fr |
- assedic\.fr |
- asso\.fr |
- avocat\.fr |
- avoues\.fr |
- barreau\.fr |
- cci\.fr |
- chambagri\.fr |
- chirurgiens-dentistes\.fr |
- com\.fr |
- experts-comptables\.fr |
- geometre-expert\.fr |
- gouv\.fr |
- greta\.fr |
- huissier-justice\.fr |
- medecin\.fr |
- nom\.fr |
- notaires\.fr |
- pharmacien\.fr |
- port\.fr |
- prd\.fr |
- presse\.fr |
- tm\.fr |
- veterinaire\.fr |
- com\.ge |
- edu\.ge |
- gov\.ge |
- mil\.ge |
- net\.ge |
- org\.ge |
- pvt\.ge |
- ac\.gg |
- alderney\.gg |
- co\.gg |
- gov\.gg |
- guernsey\.gg |
- ind\.gg |
- ltd\.gg |
- net\.gg |
- org\.gg |
- sark\.gg |
- sch\.gg |
- com\.gu |
- edu\.gu |
- gov\.gu |
- mil\.gu |
- net\.gu |
- org\.gu |
- com\.hk |
- edu\.hk |
- gov\.hk |
- idv\.hk |
- net\.hk |
- org\.hk |
- 2000\.hu |
- agrar\.hu |
- bolt\.hu |
- casino\.hu |
- city\.hu |
- co\.hu |
- erotica\.hu |
- erotika\.hu |
- film\.hu |
- forum\.hu |
- games\.hu |
- hotel\.hu |
- info\.hu |
- ingatlan\.hu |
- jogasz\.hu |
- konyvelo\.hu |
- lakas\.hu |
- media\.hu |
- news\.hu |
- org\.hu |
- priv\.hu |
- reklam\.hu |
- sex\.hu |
- shop\.hu |
- sport\.hu |
- suli\.hu |
- szex\.hu |
- tm\.hu |
- tozsde\.hu |
- utazas\.hu |
- video\.hu |
- ac\.id |
- co\.id |
- go\.id |
- mil\.id |
- net\.id |
- or\.id |
- ac\.il |
- co\.il |
- gov\.il |
- idf\.il |
- k12\.il |
- muni\.il |
- net\.il |
- org\.il |
- ac\.im |
- co\.im |
- gov\.im |
- net\.im |
- nic\.im |
- org\.im |
- ac\.in |
- co\.in |
- ernet\.in |
- firm\.in |
- gen\.in |
- gov\.in |
- ind\.in |
- mil\.in |
- net\.in |
- nic\.in |
- org\.in |
- res\.in |
- ac\.je |
- co\.je |
- gov\.je |
- ind\.je |
- jersey\.je |
- ltd\.je |
- net\.je |
- org\.je |
- sch\.je |
- com\.jo |
- edu\.jo |
- gov\.jo |
- mil\.jo |
- net\.jo |
- org\.jo |
- ac\.jp |
- ad\.jp |
- aichi\.jp |
- akita\.jp |
- aomori\.jp |
- chiba\.jp |
- co\.jp |
- ed\.jp |
- ehime\.jp |
- fukui\.jp |
- fukuoka\.jp |
- fukushima\.jp |
- gifu\.jp |
- go\.jp |
- gov\.jp |
- gr\.jp |
- gunma\.jp |
- hiroshima\.jp |
- hokkaido\.jp |
- hyogo\.jp |
- ibaraki\.jp |
- ishikawa\.jp |
- iwate\.jp |
- kagawa\.jp |
- kagoshima\.jp |
- kanagawa\.jp |
- kanazawa\.jp |
- kawasaki\.jp |
- kitakyushu\.jp |
- kobe\.jp |
- kochi\.jp |
- kumamoto\.jp |
- kyoto\.jp |
- lg\.jp |
- matsuyama\.jp |
- mie\.jp |
- miyagi\.jp |
- miyazaki\.jp |
- nagano\.jp |
- nagasaki\.jp |
- nagoya\.jp |
- nara\.jp |
- ne\.jp |
- net\.jp |
- niigata\.jp |
- oita\.jp |
- okayama\.jp |
- okinawa\.jp |
- org\.jp |
- or\.jp |
- osaka\.jp |
- saga\.jp |
- saitama\.jp |
- sapporo\.jp |
- sendai\.jp |
- shiga\.jp |
- shimane\.jp |
- shizuoka\.jp |
- takamatsu\.jp |
- tochigi\.jp |
- tokushima\.jp |
- tokyo\.jp |
- tottori\.jp |
- toyama\.jp |
- utsunomiya\.jp |
- wakayama\.jp |
- yamagata\.jp |
- yamaguchi\.jp |
- yamanashi\.jp |
- yokohama\.jp |
- com\.kh |
- edu\.kh |
- gov\.kh |
- mil\.kh |
- net\.kh |
- org\.kh |
- per\.kh |
- ac\.kr |
- co\.kr |
- go\.kr |
- kyonggi\.kr |
- ne\.kr |
- or\.kr |
- pe\.kr |
- re\.kr |
- seoul\.kr |
- com\.kw |
- edu\.kw |
- gov\.kw |
- net\.kw |
- org\.kw |
- com\.la |
- net\.la |
- org\.la |
- com\.lb |
- edu\.lb |
- gov\.lb |
- mil\.lb |
- net\.lb |
- org\.lb |
- com\.lc |
- edu\.lc |
- gov\.lc |
- net\.lc |
- org\.lc |
- asn\.lv |
- com\.lv |
- conf\.lv |
- edu\.lv |
- gov\.lv |
- id\.lv |
- mil\.lv |
- net\.lv |
- org\.lv |
- com\.ly |
- net\.ly |
- org\.ly |
- ac\.ma |
- co\.ma |
- net\.ma |
- org\.ma |
- press\.ma |
- com\.mk |
- com\.mm |
- edu\.mm |
- gov\.mm |
- net\.mm |
- org\.mm |
- com\.mo |
- edu\.mo |
- gov\.mo |
- net\.mo |
- org\.mo |
- com\.mt |
- edu\.mt |
- net\.mt |
- org\.mt |
- tm\.mt |
- uu\.mt |
- com\.mx |
- net\.mx |
- org\.mx |
- com\.my |
- edu\.my |
- gov\.my |
- net\.my |
- org\.my |
- alt\.na |
- com\.na |
- cul\.na |
- edu\.na |
- net\.na |
- org\.na |
- telecom\.na |
- unam\.na |
- com\.nc |
- net\.nc |
- org\.nc |
- de\.net |
- gb\.net |
- uk\.net |
- ac\.ng |
- com\.ng |
- edu\.ng |
- gov\.ng |
- net\.ng |
- org\.ng |
- sch\.ng |
- com\.ni |
- edu\.ni |
- gob\.ni |
- net\.ni |
- nom\.ni |
- org\.ni |
- tel\.no |
- com\.np |
- edu\.np |
- gov\.np |
- net\.np |
- org\.np |
- fax\.nr |
- mobile\.nr |
- mobil\.nr |
- mob\.nr |
- tel\.nr |
- tlf\.nr |
- ac\.nz |
- co\.nz |
- cri\.nz |
- geek\.nz |
- gen\.nz |
- govt\.nz |
- iwi\.nz |
- maori\.nz |
- mil\.nz |
- net\.nz |
- org\.nz |
- school\.nz |
- ac\.om |
- biz\.om |
- com\.om |
- co\.om |
- edu\.om |
- gov\.om |
- med\.om |
- mod\.om |
- museum\.om |
- net\.om |
- org\.om |
- pro\.om |
- dk\.org |
- eu\.org |
- ac\.pa |
- com\.pa |
- edu\.pa |
- gob\.pa |
- net\.pa |
- org\.pa |
- sld\.pa |
- com\.pe |
- edu\.pe |
- gob\.pe |
- mil\.pe |
- net\.pe |
- nom\.pe |
- org\.pe |
- ac\.pg |
- com\.pg |
- net\.pg |
- com\.ph |
- mil\.ph |
- net\.ph |
- ngo\.ph |
- org\.ph |
- biz\.pk |
- com\.pk |
- edu\.pk |
- fam\.pk |
- gob\.pk |
- gok\.pk |
- gon\.pk |
- gop\.pk |
- gos\.pk |
- gov\.pk |
- net\.pk |
- org\.pk |
- web\.pk |
- agro\.pl |
- aid\.pl |
- atm\.pl |
- auto\.pl |
- biz\.pl |
- com\.pl |
- edu\.pl |
- gmina\.pl |
- gsm\.pl |
- info\.pl |
- mail\.pl |
- media\.pl |
- miasta\.pl |
- mil\.pl |
- net\.pl |
- nieruchomosci\.pl |
- nom\.pl |
- org\.pl |
- pc\.pl |
- powiat\.pl |
- priv\.pl |
- realestate\.pl |
- rel\.pl |
- sex\.pl |
- shop\.pl |
- sklep\.pl |
- sos\.pl |
- szkola\.pl |
- targi\.pl |
- tm\.pl |
- tourism\.pl |
- travel\.pl |
- turystyka\.pl |
- edu\.ps |
- gov\.ps |
- plo\.ps |
- sec\.ps |
- com\.py |
- edu\.py |
- net\.py |
- org\.py |
- com\.qa |
- edu\.qa |
- gov\.qa |
- net\.qa |
- org\.qa |
- asso\.re |
- com\.re |
- nom\.re |
- com\.ru |
- net\.ru |
- org\.ru |
- pp\.ru |
- com\.sa |
- edu\.sa |
- gov\.sa |
- med\.sa |
- net\.sa |
- org\.sa |
- pub\.sa |
- sch\.sa |
- com\.sb |
- edu\.sb |
- gov\.sb |
- net\.sb |
- org\.sb |
- com\.sd |
- edu\.sd |
- gov\.sd |
- med\.sd |
- net\.sd |
- org\.sd |
- sch\.sd |
- com\.sg |
- edu\.sg |
- gov\.sg |
- net\.sg |
- org\.sg |
- per\.sg |
- com\.sh |
- edu\.sh |
- gov\.sh |
- mil\.sh |
- net\.sh |
- org\.sh |
- com\.sv |
- edu\.sv |
- gob\.sv |
- org\.sv |
- red\.sv |
- com\.sy |
- gov\.sy |
- net\.sy |
- org\.sy |
- ac\.th |
- co\.th |
- go\.th |
- net\.th |
- or\.th |
- com\.tn |
- edunet\.tn |
- ens\.tn |
- fin\.tn |
- gov\.tn |
- ind\.tn |
- info\.tn |
- intl\.tn |
- nat\.tn |
- net\.tn |
- org\.tn |
- rnrt\.tn |
- rns\.tn |
- rnu\.tn |
- tourism\.tn |
- bbs\.tr |
- com\.tr |
- edu\.tr |
- gen\.tr |
- gov\.tr |
- k12\.tr |
- mil\.tr |
- net\.tr |
- org\.tr |
- at\.tt |
- au\.tt |
- be\.tt |
- biz\.tt |
- ca\.tt |
- com\.tt |
- co\.tt |
- de\.tt |
- dk\.tt |
- edu\.tt |
- es\.tt |
- eu\.tt |
- fr\.tt |
- gov\.tt |
- info\.tt |
- it\.tt |
- name\.tt |
- net\.tt |
- nic\.tt |
- org\.tt |
- pro\.tt |
- se\.tt |
- uk\.tt |
- us\.tt |
- co\.tv |
- com\.tw |
- edu\.tw |
- gove\.tw |
- idv\.tw |
- net\.tw |
- org\.tw |
- com\.ua |
- edu\.ua |
- gov\.ua |
- net\.ua |
- org\.ua |
- ac\.ug |
- co\.ug |
- go\.ug |
- or\.ug |
- ac\.uk |
- co\.uk |
- edu\.uk |
- gov\.uk |
- ltd\.uk |
- me\.uk |
- mod\.uk |
- net\.uk |
- nhs\.uk |
- nic\.uk |
- org\.uk |
- plc\.uk |
- police\.uk |
- sch\.uk |
- com\.uy |
- edu\.uy |
- gub\.uy |
- mil\.uy |
- net\.uy |
- org\.uy |
- arts\.ve |
- bib\.ve |
- com\.ve |
- co\.ve |
- edu\.ve |
- firm\.ve |
- gov\.ve |
- info\.ve |
- int\.ve |
- mil\.ve |
- net\.ve |
- nom\.ve |
- org\.ve |
- rec\.ve |
- store\.ve |
- tec\.ve |
- web\.ve |
- co\.vi |
- net\.vi |
- org\.vi |
- ac\.vn |
- biz\.vn |
- com\.vn |
- edu\.vn |
- gov\.vn |
- health\.vn |
- info\.vn |
- int\.vn |
- name\.vn |
- net\.vn |
- org\.vn |
- pro\.vn |
- ch\.vu |
- com\.vu |
- de\.vu |
- edu\.vu |
- fr\.vu |
- net\.vu |
- org\.vu |
- com\.ws |
- edu\.ws |
- gov\.ws |
- net\.ws |
- org\.ws |
- com\.ye |
- edu\.ye |
- gov\.ye |
- mil\.ye |
- net\.ye |
- org\.ye |
- ac\.yu |
- co\.yu |
- edu\.yu |
- org\.yu |
- ac\.za |
- alt\.za |
- bourse\.za |
- city\.za |
- co\.za |
- edu\.za |
- gov\.za |
- law\.za |
- mil\.za |
- net\.za |
- ngo\.za |
- nom\.za |
- org\.za |
- school\.za |
- tm\.za |
- web\.za |
- ac\.zw |
- co\.zw |
- gov\.zw |
- org\.zw
-
- )
-}ix;
+# This is required because the .us domain is nuts. See the three- and
+# four-level .us handling in split_domain() below.
+#
+foreach (qw/
+ ak al ar az ca co ct dc de fl ga gu hi ia id il in ks ky la ma md me mi
+ mn mo ms mt nc nd ne nh nj nm nv ny oh ok or pa pr ri sc sd tn tx ut va vi
+ vt wa wi wv wy
+ /) {
+ $US_STATES{$_} = 1;
+}
###########################################################################
@@ -986,12 +194,10 @@
=cut
sub split_domain {
- my ($domain) = @_;
+ my $domain = lc $_[0];
my $hostname = '';
if ($domain) {
- my $partsreqd = 2; # default to domain.tld
-
# www..spamassassin.org -> www.spamassassin.org
$domain =~ tr/././s;
@@ -1001,25 +207,49 @@
# Split scalar domain into components
my @domparts = split (/\./, $domain);
+ my @hostname=();
+
+ while (@domparts > 1) { # go until we find the TLD
+ if (@domparts == 4) {
+ if ($domparts[3] eq 'us' && (
+ ($domparts[0] eq 'pvt' && $domparts[1] eq 'k12') ||
+ $domparts[0] =~ /^c[io]$/
+ )) {
+ # http://www.neustar.us/policies/docs/rfc_1480.txt
+ # "Fire-Dept.CI.Los-Angeles.CA.US"
+ # "<school-name>.PVT.K12.<state>.US"
+ last if ($US_STATES{$domparts[2]});
+ }
+ }
+ elsif (@domparts == 3) {
+ # http://www.neustar.us/policies/docs/rfc_1480.txt
+ # demon.co.uk
+ # esc.edu.ar
+ # [^\.]+\.${US_STATES}\.us
+ if ($domparts[2] eq 'uk' || $domparts[2] eq 'ar') {
+ my $temp = join(".", @domparts);
+ last if ($temp eq 'demon.co.uk' || $temp eq 'esc.edu.ar');
+ }
+ elsif ($domparts[2] eq 'us') {
+ last if ($US_STATES{$domparts[1]});
+ }
+ }
+ elsif (@domparts == 2) {
+ # co.uk, etc.
+ my $temp = join(".", @domparts);
+ last if ($TWO_LEVEL_DOMAINS{$temp});
+ }
+ push(@hostname, shift @domparts);
+ }
# Look for a sub-delegated TLD
# use @domparts to skip trying to match on TLDs that can't possibly
# match, but keep in mind that the hostname can be blank, so 4TLD needs 4,
# 3TLD needs 3, 2TLD needs 2 ...
#
- if (@domparts >= 4 && $domain =~ /(?:\.|^)${FOUR_LEVEL_DOMAINS}$/io) # Fire-Dept.CI.Los-Angeles.CA.US
- { $partsreqd = 5; }
- elsif (@domparts >= 3 && $domain =~ /(?:\.|^)${THREE_LEVEL_DOMAINS}$/io) # demon.co.uk
- { $partsreqd = 4; }
- elsif (@domparts >= 2 && $domain =~ /(?:\.|^)${TWO_LEVEL_DOMAINS}$/io) # co.uk
- { $partsreqd = 3; }
-
- if (@domparts >= $partsreqd) {
- # reset the domain to the last $partsreqd parts
- $domain = join(".", splice(@domparts, -$partsreqd));
- # chopped is everything else ...
- $hostname = join(".", @domparts);
- }
+ unshift @domparts, pop @hostname if @hostname;
+ $domain = join(".", @domparts);
+ $hostname = join(".", @hostname);
}
($hostname, $domain);
@@ -1062,7 +292,8 @@
return 0 if ($dom =~ /\s/);
# ensure it ends in a known-valid TLD, and has at least 1 dot
- return 0 if ($dom !~ /\.${VALID_TLDS}$/io);
+ return 0 unless ($dom =~ /\.([^.]+)$/);
+ return 0 unless ($VALID_TLDS{$1});
return 1; # nah, it's ok.
}