Added test cases for regex

This commit is contained in:
Ben Baert 2018-05-31 12:54:21 +02:00
parent eaf2668a84
commit fe914d6f57
4 changed files with 172 additions and 114 deletions

65
email_regex.py Normal file
View File

@ -0,0 +1,65 @@
# All we are really doing is comparing the input string to one
# gigantic regular expression. But building that regexp, and
# ensuring its correctness, is made much easier by assembling it
# from the "tokens" defined by the RFC. Each of these tokens is
# tested in the accompanying unit test file.
#
# The section of RFC 2822 from which each pattern component is
# derived is given in an accompanying comment.
#
# (To make things simple, every string below is given as 'raw',
# even when it's not strictly necessary. This way we don't forget
# when it is necessary.)
#
import re
WSP = r'[\s]' # see 2.2.2. Structured Header Field Bodies
CRLF = r'(?:\r\n)' # see 2.2.3. Long Header Fields
NO_WS_CTL = r'\x01-\x08\x0b\x0c\x0f-\x1f\x7f' # see 3.2.1. Primitive Tokens
QUOTED_PAIR = r'(?:\\.)' # see 3.2.2. Quoted characters
FWS = r'(?:(?:' + WSP + r'*' + CRLF + r')?' + \
WSP + r'+)' # see 3.2.3. Folding white space and comments
CTEXT = r'[' + NO_WS_CTL + \
r'\x21-\x27\x2a-\x5b\x5d-\x7e]' # see 3.2.3
CCONTENT = r'(?:' + CTEXT + r'|' + \
QUOTED_PAIR + r')' # see 3.2.3 (NB: The RFC includes COMMENT here
# as well, but that would be circular.)
COMMENT = r'\((?:' + FWS + r'?' + CCONTENT + \
r')*' + FWS + r'?\)' # see 3.2.3
CFWS = r'(?:' + FWS + r'?' + COMMENT + ')*(?:' + \
FWS + '?' + COMMENT + '|' + FWS + ')' # see 3.2.3
ATEXT = r'[\w!#$%&\'\*\+\-/=\?\^`\{\|\}~]' # see 3.2.4. Atom
ATOM = CFWS + r'?' + ATEXT + r'+' + CFWS + r'?' # see 3.2.4
DOT_ATOM_TEXT = ATEXT + r'+(?:\.' + ATEXT + r'+)*' # see 3.2.4
DOT_ATOM = CFWS + r'?' + DOT_ATOM_TEXT + CFWS + r'?' # see 3.2.4
QTEXT = r'[' + NO_WS_CTL + \
r'\x21\x23-\x5b\x5d-\x7e]' # see 3.2.5. Quoted strings
QCONTENT = r'(?:' + QTEXT + r'|' + \
QUOTED_PAIR + r')' # see 3.2.5
QUOTED_STRING = CFWS + r'?' + r'"(?:' + FWS + \
r'?' + QCONTENT + r')*' + FWS + \
r'?' + r'"' + CFWS + r'?'
LOCAL_PART = r'(?:' + DOT_ATOM + r'|' + \
QUOTED_STRING + r')' # see 3.4.1. Addr-spec specification
DTEXT = r'[' + NO_WS_CTL + r'\x21-\x5a\x5e-\x7e]' # see 3.4.1
DCONTENT = r'(?:' + DTEXT + r'|' + \
QUOTED_PAIR + r')' # see 3.4.1
DOMAIN_LITERAL = CFWS + r'?' + r'\[' + \
r'(?:' + FWS + r'?' + DCONTENT + \
r')*' + FWS + r'?\]' + CFWS + r'?' # see 3.4.1
DOMAIN = r'(?:' + DOT_ATOM + r'|' + \
DOMAIN_LITERAL + r')' # see 3.4.1
ADDR_SPEC = LOCAL_PART + r'@' + DOMAIN # see 3.4.1
VALID_ADDRESS_REGEXP = '^' + ADDR_SPEC + '$'
def email_has_valid_structure(email_address):
if re.match(VALID_ADDRESS_REGEXP, email_address):
return True
return False

3
setup.cfg Normal file
View File

@ -0,0 +1,3 @@
[nosetests]
verbosity=3
with-doctest=1

55
tests.py Normal file
View File

@ -0,0 +1,55 @@
from email_regex import email_has_valid_structure
VALID_EMAIL_ADDRESS_EXAMPLES = [
"email@domain.com", # basic valid email
"firstname.lastname@domain.com", # dot in address field
"email@subdomain.domain.com", # dot in subdomain
"firstname+lastname@domain.com", # + in address field
"email@123.123.123.123", # domain address is IP address
"email@[123.123.123.123]", # square brackets around IP address
"\"email\"@domain.com", # quote marks in address fields
"1234567890@domain.com", # numbers in address field
"email@domain-one.com", # dash in subdomain
"_______@domain.com", # underscore in address field
"email@domain.name", # .name top level domain name
"email@domain.co.jp", # dot in top level domain
"firstname-lastname@domain.com" # dash in address field
]
INVALID_EMAIL_ADDRESS_EXAMPLES = [
"plainaddress", # Missing @ sign and domain,
"#@%^%#$@#$@#.com", # Garbage,
"@domain.com", # Missing username,
"Joe Smith <email@domain.com>", # Encoded html within email is invalid,
"email.domain.com", # Missing @,
"email@domain@domain.com", # Two @ sign,
".email@domain.com", # Leading dot in address is not allowed,
"email.@domain.com", # Trailing dot in address is not allowed,
"email..email@domain.com", # Multiple dots,
"あいうえお@domain.com", # Unicode char as address,
"email@domain.com (Joe Smith)", # Text followed email is not allowed,
"email@domain", # Missing top level domain (.com/.net/.org/etc),
"email@-domain.com", # Leading dash in front of domain is invalid,
"email@domain.web", # .web is not a valid top level domain,
"email@111.222.333.44444", # Invalid IP format,
"email@domain..com", # Multiple dot in the domain portion is invalid
]
def test_valid_email_structure_regex():
for index, valid_email_address in enumerate(VALID_EMAIL_ADDRESS_EXAMPLES):
try:
assert email_has_valid_structure(valid_email_address) is True
except AssertionError:
raise AssertionError(
("{} should be valid ({}th email address in the list)"
.format(valid_email_address, index)))
def test_invalid_email_structure_regex():
for index, invalid_email_address in enumerate(INVALID_EMAIL_ADDRESS_EXAMPLES):
try:
assert email_has_valid_structure(invalid_email_address) is False
except AssertionError:
raise AssertionError(
("{} should be invalid ({}th email address in the list)"
.format(invalid_email_address, index)))

View File

@ -17,80 +17,14 @@
# exception of a circular definition (see comments below), and
# with the omission of the pattern components marked as "obsolete".
import re
import smtplib
import logging
import socket
try:
raw_input
except NameError:
def raw_input(prompt=''):
return input(prompt)
class ServerError(Exception):
pass
try:
import DNS
ServerError = DNS.ServerError
DNS.DiscoverNameServers()
except (ImportError, AttributeError):
DNS = None
class ServerError(Exception):
pass
# All we are really doing is comparing the input string to one
# gigantic regular expression. But building that regexp, and
# ensuring its correctness, is made much easier by assembling it
# from the "tokens" defined by the RFC. Each of these tokens is
# tested in the accompanying unit test file.
#
# The section of RFC 2822 from which each pattern component is
# derived is given in an accompanying comment.
#
# (To make things simple, every string below is given as 'raw',
# even when it's not strictly necessary. This way we don't forget
# when it is necessary.)
#
WSP = r'[\s]' # see 2.2.2. Structured Header Field Bodies
CRLF = r'(?:\r\n)' # see 2.2.3. Long Header Fields
NO_WS_CTL = r'\x01-\x08\x0b\x0c\x0f-\x1f\x7f' # see 3.2.1. Primitive Tokens
QUOTED_PAIR = r'(?:\\.)' # see 3.2.2. Quoted characters
FWS = r'(?:(?:' + WSP + r'*' + CRLF + r')?' + \
WSP + r'+)' # see 3.2.3. Folding white space and comments
CTEXT = r'[' + NO_WS_CTL + \
r'\x21-\x27\x2a-\x5b\x5d-\x7e]' # see 3.2.3
CCONTENT = r'(?:' + CTEXT + r'|' + \
QUOTED_PAIR + r')' # see 3.2.3 (NB: The RFC includes COMMENT here
# as well, but that would be circular.)
COMMENT = r'\((?:' + FWS + r'?' + CCONTENT + \
r')*' + FWS + r'?\)' # see 3.2.3
CFWS = r'(?:' + FWS + r'?' + COMMENT + ')*(?:' + \
FWS + '?' + COMMENT + '|' + FWS + ')' # see 3.2.3
ATEXT = r'[\w!#$%&\'\*\+\-/=\?\^`\{\|\}~]' # see 3.2.4. Atom
ATOM = CFWS + r'?' + ATEXT + r'+' + CFWS + r'?' # see 3.2.4
DOT_ATOM_TEXT = ATEXT + r'+(?:\.' + ATEXT + r'+)*' # see 3.2.4
DOT_ATOM = CFWS + r'?' + DOT_ATOM_TEXT + CFWS + r'?' # see 3.2.4
QTEXT = r'[' + NO_WS_CTL + \
r'\x21\x23-\x5b\x5d-\x7e]' # see 3.2.5. Quoted strings
QCONTENT = r'(?:' + QTEXT + r'|' + \
QUOTED_PAIR + r')' # see 3.2.5
QUOTED_STRING = CFWS + r'?' + r'"(?:' + FWS + \
r'?' + QCONTENT + r')*' + FWS + \
r'?' + r'"' + CFWS + r'?'
LOCAL_PART = r'(?:' + DOT_ATOM + r'|' + \
QUOTED_STRING + r')' # see 3.4.1. Addr-spec specification
DTEXT = r'[' + NO_WS_CTL + r'\x21-\x5a\x5e-\x7e]' # see 3.4.1
DCONTENT = r'(?:' + DTEXT + r'|' + \
QUOTED_PAIR + r')' # see 3.4.1
DOMAIN_LITERAL = CFWS + r'?' + r'\[' + \
r'(?:' + FWS + r'?' + DCONTENT + \
r')*' + FWS + r'?\]' + CFWS + r'?' # see 3.4.1
DOMAIN = r'(?:' + DOT_ATOM + r'|' + \
DOMAIN_LITERAL + r')' # see 3.4.1
ADDR_SPEC = LOCAL_PART + r'@' + DOMAIN # see 3.4.1
# A valid address will match exactly the 3.4.1 addr-spec.
VALID_ADDRESS_REGEXP = '^' + ADDR_SPEC + '$'
MX_DNS_CACHE = {}
MX_CHECK_CACHE = {}
@ -123,56 +57,57 @@ def validate_email(email, check_mx=False, verify=False, debug=False, smtp_timeou
else:
logger = None
try:
assert re.match(VALID_ADDRESS_REGEXP, email) is not None
check_mx |= verify
if check_mx:
if not DNS:
raise Exception('For check the mx records or check if the email exists you must '
'have installed pyDNS python package')
hostname = email[email.find('@') + 1:]
mx_hosts = get_mx_ip(hostname)
if mx_hosts is None:
return False
for mx in mx_hosts:
try:
if not verify and mx[1] in MX_CHECK_CACHE:
return MX_CHECK_CACHE[mx[1]]
smtp = smtplib.SMTP(timeout=smtp_timeout)
smtp.connect(mx[1])
MX_CHECK_CACHE[mx[1]] = True
if not verify:
try:
if not email_has_valid_structure(email_address):
return False
check_mx |= verify
if check_mx:
if not DNS:
raise Exception('For check the mx records or check if the email exists you must '
'have installed pyDNS python package')
hostname = email[email.find('@') + 1:]
mx_hosts = get_mx_ip(hostname)
if mx_hosts is None:
return False
for mx in mx_hosts:
try:
if not verify and mx[1] in MX_CHECK_CACHE:
return MX_CHECK_CACHE[mx[1]]
smtp = smtplib.SMTP(timeout=smtp_timeout)
smtp.connect(mx[1])
MX_CHECK_CACHE[mx[1]] = True
if not verify:
try:
smtp.quit()
except smtplib.SMTPServerDisconnected:
pass
return True
status, _ = smtp.helo()
if status != 250:
smtp.quit()
except smtplib.SMTPServerDisconnected:
pass
return True
status, _ = smtp.helo()
if status != 250:
smtp.quit()
if debug:
logger.debug(u'%s answer: %s - %s', mx[1], status, _)
continue
smtp.mail('')
status, _ = smtp.rcpt(email)
if status == 250:
smtp.quit()
return True
if debug:
logger.debug(u'%s answer: %s - %s', mx[1], status, _)
continue
smtp.mail('')
status, _ = smtp.rcpt(email)
if status == 250:
smtp.quit()
return True
if debug:
logger.debug(u'%s answer: %s - %s', mx[1], status, _)
smtp.quit()
except smtplib.SMTPServerDisconnected: # Server not permits verify user
if debug:
logger.debug(u'%s disconected.', mx[1])
except smtplib.SMTPConnectError:
if debug:
logger.debug(u'Unable to connect to %s.', mx[1])
return None
except AssertionError:
return False
except (ServerError, socket.error) as e:
if debug:
logger.debug('ServerError or socket.error exception raised (%s).', e)
except smtplib.SMTPServerDisconnected: # Server not permits verify user
if debug:
logger.debug(u'%s disconected.', mx[1])
except smtplib.SMTPConnectError:
if debug:
logger.debug(u'Unable to connect to %s.', mx[1])
return None
except AssertionError:
return False
except (ServerError, socket.error) as e:
if debug:
logger.debug('ServerError or socket.error exception raised (%s).', e)
return None
return True