Skip to content

Commit e520f20

Browse files
committed
Refactor parser
This fixes inconsistencies reported after the release of version 0.10.0:

* Valid escapes were interpreted as control characters even when in single-quoted strings.
* `#` was interpreted as the start of a comment even if there was no whitespace preceding it.

However, we are keeping the interpretation of escapes in double-quoted strings, as they didn't make sense in versions before 0.10.0.

The single large regular expression is replaced with a handwritten top-down parser using smaller regular expressions. The reason for this change is that it would have been very difficult or impossible to satisfy the parsing requirements with a single regex.
1 parent fd0a487 commit e520f20

File tree

4 files changed

+149
-68
lines changed

4 files changed

+149
-68
lines changed

README.md

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -299,8 +299,9 @@ Changelog
299299
Unreleased
300300
-----
301301

302-
- ...
303-
302+
- Refactor parser to fix parsing inconsistencies ([@bbc2])([#170]).
303+
- Interpret escapes as control characters only in double-quoted strings.
304+
- Interpret `#` as start of comment only if preceded by whitespace.
304305

305306
0.10.2
306307
-----
@@ -428,6 +429,7 @@ Unreleased
428429
[#172]: https://github.com/theskumar/python-dotenv/issues/172
429430
[#121]: https://github.com/theskumar/python-dotenv/issues/121
430431
[#176]: https://github.com/theskumar/python-dotenv/issues/176
432+
[#170]: https://github.com/theskumar/python-dotenv/issues/170
431433

432434
[@asyncee]: https://github.com/asyncee
433435
[@greyli]: https://github.com/greyli

src/dotenv/compat.py

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,23 @@
11
import sys
2+
from typing import Text
23

34
if sys.version_info >= (3, 0):
45
from io import StringIO # noqa
56
else:
67
from StringIO import StringIO # noqa
78

89
PY2 = sys.version_info[0] == 2 # type: bool
10+
11+
12+
def to_text(string):
13+
# type: (str) -> Text
14+
"""
15+
Make a string Unicode if it isn't already.
16+
17+
This is useful for defining raw unicode strings because `ur"foo"` isn't valid in
18+
Python 3.
19+
"""
20+
if PY2:
21+
return string.decode("utf-8")
22+
else:
23+
return string

src/dotenv/parser.py

Lines changed: 122 additions & 57 deletions
Original file line numberDiff line numberDiff line change
@@ -1,82 +1,147 @@
11
import codecs
22
import re
33
from typing import (IO, Iterator, Match, NamedTuple, Optional, Pattern, # noqa
4-
Text, Tuple)
4+
Sequence, Text)
55

6-
_binding = re.compile(
7-
r"""
8-
(
9-
\s* # leading whitespace
10-
(?:export{0}+)? # export
6+
from .compat import to_text
117

12-
( '[^']+' # single-quoted key
13-
| [^=\#\s]+ # or unquoted key
14-
)?
158

16-
(?:
17-
(?:{0}*={0}*) # equal sign
9+
def make_regex(string, extra_flags=0):
10+
# type: (str, int) -> Pattern[Text]
11+
return re.compile(to_text(string), re.UNICODE | extra_flags)
1812

19-
( '(?:\\'|[^'])*' # single-quoted value
20-
| "(?:\\"|[^"])*" # or double-quoted value
21-
| [^\#\r\n]* # or unquoted value
22-
)
23-
)?
24-
25-
\s* # trailing whitespace
26-
(?:\#[^\r\n]*)? # comment
27-
(?:\r|\n|\r\n)? # newline
28-
)
29-
""".format(r'[^\S\r\n]'),
30-
re.MULTILINE | re.VERBOSE,
31-
) # type: Pattern[Text]
32-
33-
_escape_sequence = re.compile(r"\\[\\'\"abfnrtv]") # type: Pattern[Text]
3413

14+
_whitespace = make_regex(r"\s*", extra_flags=re.MULTILINE)
15+
_export = make_regex(r"(?:export[^\S\r\n]+)?")
16+
_single_quoted_key = make_regex(r"'([^']+)'")
17+
_unquoted_key = make_regex(r"([^=\#\s]+)")
18+
_equal_sign = make_regex(r"[^\S\r\n]*=[^\S\r\n]*")
19+
_single_quoted_value = make_regex(r"'((?:\\'|[^'])*)'")
20+
_double_quoted_value = make_regex(r'"((?:\\"|[^"])*)"')
21+
_unquoted_value_part = make_regex(r"([^ \r\n]*)")
22+
_comment = make_regex(r"(?:\s*#[^\r\n]*)?")
23+
_end_of_line = make_regex(r"[^\S\r\n]*(?:\r\n|\n|\r)?")
24+
_rest_of_line = make_regex(r"[^\r\n]*(?:\r|\n|\r\n)?")
25+
_double_quote_escapes = make_regex(r"\\[\\'\"abfnrtv]")
26+
_single_quote_escapes = make_regex(r"\\[\\']")
3527

3628
Binding = NamedTuple("Binding", [("key", Optional[Text]),
3729
("value", Optional[Text]),
3830
("original", Text)])
3931

4032

41-
def decode_escapes(string):
42-
# type: (Text) -> Text
33+
class Error(Exception):
34+
pass
35+
36+
37+
class Reader:
38+
def __init__(self, stream):
39+
# type: (IO[Text]) -> None
40+
self.string = stream.read()
41+
self.position = 0
42+
self.mark = 0
43+
44+
def has_next(self):
45+
# type: () -> bool
46+
return self.position < len(self.string)
47+
48+
def set_mark(self):
49+
# type: () -> None
50+
self.mark = self.position
51+
52+
def get_marked(self):
53+
# type: () -> Text
54+
return self.string[self.mark:self.position]
55+
56+
def peek(self, count):
57+
# type: (int) -> Text
58+
return self.string[self.position:self.position + count]
59+
60+
def read(self, count):
61+
# type: (int) -> Text
62+
result = self.string[self.position:self.position + count]
63+
if len(result) < count:
64+
raise Error("read: End of string")
65+
self.position += count
66+
return result
67+
68+
def read_regex(self, regex):
69+
# type: (Pattern[Text]) -> Sequence[Text]
70+
match = regex.match(self.string, self.position)
71+
if match is None:
72+
raise Error("read_regex: Pattern not found")
73+
self.position = match.end()
74+
return match.groups()
75+
76+
77+
def decode_escapes(regex, string):
78+
# type: (Pattern[Text], Text) -> Text
4379
def decode_match(match):
4480
# type: (Match[Text]) -> Text
4581
return codecs.decode(match.group(0), 'unicode-escape') # type: ignore
4682

47-
return _escape_sequence.sub(decode_match, string)
83+
return regex.sub(decode_match, string)
4884

4985

50-
def is_surrounded_by(string, char):
51-
# type: (Text, Text) -> bool
52-
return (
53-
len(string) > 1
54-
and string[0] == string[-1] == char
55-
)
56-
57-
58-
def parse_binding(string, position):
59-
# type: (Text, int) -> Tuple[Binding, int]
60-
match = _binding.match(string, position)
61-
assert match is not None
62-
(matched, key, value) = match.groups()
63-
if key is None or value is None:
64-
key = None
65-
value = None
86+
def parse_key(reader):
87+
# type: (Reader) -> Text
88+
char = reader.peek(1)
89+
if char == "'":
90+
(key,) = reader.read_regex(_single_quoted_key)
91+
else:
92+
(key,) = reader.read_regex(_unquoted_key)
93+
return key
94+
95+
96+
def parse_unquoted_value(reader):
97+
# type: (Reader) -> Text
98+
value = u""
99+
while True:
100+
(part,) = reader.read_regex(_unquoted_value_part)
101+
value += part
102+
after = reader.peek(2)
103+
if len(after) < 2 or after[0] in u"\r\n" or after[1] in u" #\r\n":
104+
return value
105+
value += reader.read(2)
106+
107+
108+
def parse_value(reader):
109+
# type: (Reader) -> Text
110+
char = reader.peek(1)
111+
if char == u"'":
112+
(value,) = reader.read_regex(_single_quoted_value)
113+
return decode_escapes(_single_quote_escapes, value)
114+
elif char == u'"':
115+
(value,) = reader.read_regex(_double_quoted_value)
116+
return decode_escapes(_double_quote_escapes, value)
117+
elif char in (u"", u"\n", u"\r"):
118+
return u""
66119
else:
67-
value_quoted = is_surrounded_by(value, "'") or is_surrounded_by(value, '"')
68-
if value_quoted:
69-
value = decode_escapes(value[1:-1])
70-
else:
71-
value = value.strip()
72-
return (Binding(key=key, value=value, original=matched), match.end())
120+
return parse_unquoted_value(reader)
121+
122+
123+
def parse_binding(reader):
124+
# type: (Reader) -> Binding
125+
reader.set_mark()
126+
try:
127+
reader.read_regex(_whitespace)
128+
reader.read_regex(_export)
129+
key = parse_key(reader)
130+
reader.read_regex(_equal_sign)
131+
value = parse_value(reader)
132+
reader.read_regex(_comment)
133+
reader.read_regex(_end_of_line)
134+
return Binding(key=key, value=value, original=reader.get_marked())
135+
except Error:
136+
reader.read_regex(_rest_of_line)
137+
return Binding(key=None, value=None, original=reader.get_marked())
73138

74139

75140
def parse_stream(stream):
76141
# type:(IO[Text]) -> Iterator[Binding]
77-
string = stream.read()
78-
position = 0
79-
length = len(string)
80-
while position < length:
81-
(binding, position) = parse_binding(string, position)
82-
yield binding
142+
reader = Reader(stream)
143+
while reader.has_next():
144+
try:
145+
yield parse_binding(reader)
146+
except Error:
147+
return

tests/test_parser.py

Lines changed: 8 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -8,14 +8,13 @@
88
@pytest.mark.parametrize("test_input,expected", [
99
(u"", []),
1010
(u"a=b", [Binding(key=u"a", value=u"b", original=u"a=b")]),
11-
(u"'a'=b", [Binding(key=u"'a'", value=u"b", original=u"'a'=b")]),
11+
(u"'a'=b", [Binding(key=u"a", value=u"b", original=u"'a'=b")]),
1212
(u"[=b", [Binding(key=u"[", value=u"b", original=u"[=b")]),
1313
(u" a = b ", [Binding(key=u"a", value=u"b", original=u" a = b ")]),
1414
(u"export a=b", [Binding(key=u"a", value=u"b", original=u"export a=b")]),
15-
(u" export 'a'=b", [Binding(key=u"'a'", value=u"b", original=u" export 'a'=b")]),
16-
(u" export 'a'=b", [Binding(key=u"'a'", value=u"b", original=u" export 'a'=b")]),
15+
(u" export 'a'=b", [Binding(key=u"a", value=u"b", original=u" export 'a'=b")]),
1716
(u"# a=b", [Binding(key=None, value=None, original=u"# a=b")]),
18-
(u"a=b#c", [Binding(key=u"a", value=u"b", original=u"a=b#c")]),
17+
(u"a=b#c", [Binding(key=u"a", value=u"b#c", original=u"a=b#c")]),
1918
(u'a=b # comment', [Binding(key=u"a", value=u"b", original=u"a=b # comment")]),
2019
(u"a=b space ", [Binding(key=u"a", value=u"b space", original=u"a=b space ")]),
2120
(u"a='b space '", [Binding(key=u"a", value=u"b space ", original=u"a='b space '")]),
@@ -26,7 +25,7 @@
2625
(u"a='b\nc'", [Binding(key=u"a", value=u"b\nc", original=u"a='b\nc'")]),
2726
(u'a="b\nc"', [Binding(key=u"a", value=u"b\nc", original=u'a="b\nc"')]),
2827
(u'a="b\\nc"', [Binding(key=u"a", value=u'b\nc', original=u'a="b\\nc"')]),
29-
(u"a='b\\nc'", [Binding(key=u"a", value=u'b\nc', original=u"a='b\\nc'")]),
28+
(u"a='b\\nc'", [Binding(key=u"a", value=u'b\\nc', original=u"a='b\\nc'")]),
3029
(u'a="b\\"c"', [Binding(key=u"a", value=u'b"c', original=u'a="b\\"c"')]),
3130
(u"a='b\\'c'", [Binding(key=u"a", value=u"b'c", original=u"a='b\\'c'")]),
3231
(u"a=à", [Binding(key=u"a", value=u"à", original=u"a=à")]),
@@ -56,15 +55,15 @@
5655
(
5756
u'a=b\n\nc=d',
5857
[
59-
Binding(key=u"a", value=u"b", original=u"a=b\n\n"),
60-
Binding(key=u"c", value=u"d", original=u"c=d"),
58+
Binding(key=u"a", value=u"b", original=u"a=b\n"),
59+
Binding(key=u"c", value=u"d", original=u"\nc=d"),
6160
]
6261
),
6362
(
6463
u'a="\nb=c',
6564
[
66-
Binding(key=u"a", value=u'"', original=u'a="\n'),
67-
Binding(key=u"b", value=u'c', original=u"b=c"),
65+
Binding(key=None, value=None, original=u'a="\n'),
66+
Binding(key=u"b", value=u"c", original=u"b=c"),
6867
]
6968
),
7069
(

0 commit comments

Comments
 (0)