# multipart.py

  1. import re
  2. from dataclasses import dataclass
  3. from enum import auto
  4. from enum import Enum
  5. from typing import cast
  6. from typing import List
  7. from typing import Optional
  8. from typing import Tuple
  9. from .._internal import _to_bytes
  10. from .._internal import _to_str
  11. from ..datastructures import Headers
  12. from ..exceptions import RequestEntityTooLarge
  13. from ..http import parse_options_header
  14. class Event:
  15. pass
  16. @dataclass(frozen=True)
  17. class Preamble(Event):
  18. data: bytes
  19. @dataclass(frozen=True)
  20. class Field(Event):
  21. name: str
  22. headers: Headers
  23. @dataclass(frozen=True)
  24. class File(Event):
  25. name: str
  26. filename: str
  27. headers: Headers
  28. @dataclass(frozen=True)
  29. class Data(Event):
  30. data: bytes
  31. more_data: bool
  32. @dataclass(frozen=True)
  33. class Epilogue(Event):
  34. data: bytes
  35. class NeedData(Event):
  36. pass
  37. NEED_DATA = NeedData()
  38. class State(Enum):
  39. PREAMBLE = auto()
  40. PART = auto()
  41. DATA = auto()
  42. EPILOGUE = auto()
  43. COMPLETE = auto()
  44. # Multipart line breaks MUST be CRLF (\r\n) by RFC-7578, except that
  45. # many implementations break this and either use CR or LF alone.
  46. LINE_BREAK = b"(?:\r\n|\n|\r)"
  47. BLANK_LINE_RE = re.compile(b"(?:\r\n\r\n|\r\r|\n\n)", re.MULTILINE)
  48. LINE_BREAK_RE = re.compile(LINE_BREAK, re.MULTILINE)
  49. # Header values can be continued via a space or tab after the linebreak, as
  50. # per RFC2231
  51. HEADER_CONTINUATION_RE = re.compile(b"%s[ \t]" % LINE_BREAK, re.MULTILINE)
  52. # This must be long enough to contain any line breaks plus any
  53. # additional boundary markers (--) such that they will be found in a
  54. # subsequent search
  55. SEARCH_EXTRA_LENGTH = 8
  56. class MultipartDecoder:
  57. """Decodes a multipart message as bytes into Python events.
  58. The part data is returned as available to allow the caller to save
  59. the data from memory to disk, if desired.
  60. """
  61. def __init__(
  62. self,
  63. boundary: bytes,
  64. max_form_memory_size: Optional[int] = None,
  65. ) -> None:
  66. self.buffer = bytearray()
  67. self.complete = False
  68. self.max_form_memory_size = max_form_memory_size
  69. self.state = State.PREAMBLE
  70. self.boundary = boundary
  71. # Note in the below \h i.e. horizontal whitespace is used
  72. # as [^\S\n\r] as \h isn't supported in python.
  73. # The preamble must end with a boundary where the boundary is
  74. # prefixed by a line break, RFC2046. Except that many
  75. # implementations including Werkzeug's tests omit the line
  76. # break prefix. In addition the first boundary could be the
  77. # epilogue boundary (for empty form-data) hence the matching
  78. # group to understand if it is an epilogue boundary.
  79. self.preamble_re = re.compile(
  80. rb"%s?--%s(--[^\S\n\r]*%s?|[^\S\n\r]*%s)"
  81. % (LINE_BREAK, re.escape(boundary), LINE_BREAK, LINE_BREAK),
  82. re.MULTILINE,
  83. )
  84. # A boundary must include a line break prefix and suffix, and
  85. # may include trailing whitespace. In addition the boundary
  86. # could be the epilogue boundary hence the matching group to
  87. # understand if it is an epilogue boundary.
  88. self.boundary_re = re.compile(
  89. rb"%s--%s(--[^\S\n\r]*%s?|[^\S\n\r]*%s)"
  90. % (LINE_BREAK, re.escape(boundary), LINE_BREAK, LINE_BREAK),
  91. re.MULTILINE,
  92. )
  93. self._search_position = 0
  94. def last_newline(self) -> int:
  95. try:
  96. last_nl = self.buffer.rindex(b"\n")
  97. except ValueError:
  98. last_nl = len(self.buffer)
  99. try:
  100. last_cr = self.buffer.rindex(b"\r")
  101. except ValueError:
  102. last_cr = len(self.buffer)
  103. return min(last_nl, last_cr)
  104. def receive_data(self, data: Optional[bytes]) -> None:
  105. if data is None:
  106. self.complete = True
  107. elif (
  108. self.max_form_memory_size is not None
  109. and len(self.buffer) + len(data) > self.max_form_memory_size
  110. ):
  111. raise RequestEntityTooLarge()
  112. else:
  113. self.buffer.extend(data)
  114. def next_event(self) -> Event:
  115. event: Event = NEED_DATA
  116. if self.state == State.PREAMBLE:
  117. match = self.preamble_re.search(self.buffer, self._search_position)
  118. if match is not None:
  119. if match.group(1).startswith(b"--"):
  120. self.state = State.EPILOGUE
  121. else:
  122. self.state = State.PART
  123. data = bytes(self.buffer[: match.start()])
  124. del self.buffer[: match.end()]
  125. event = Preamble(data=data)
  126. self._search_position = 0
  127. else:
  128. # Update the search start position to be equal to the
  129. # current buffer length (already searched) minus a
  130. # safe buffer for part of the search target.
  131. self._search_position = max(
  132. 0, len(self.buffer) - len(self.boundary) - SEARCH_EXTRA_LENGTH
  133. )
  134. elif self.state == State.PART:
  135. match = BLANK_LINE_RE.search(self.buffer, self._search_position)
  136. if match is not None:
  137. headers = self._parse_headers(self.buffer[: match.start()])
  138. del self.buffer[: match.end()]
  139. if "content-disposition" not in headers:
  140. raise ValueError("Missing Content-Disposition header")
  141. disposition, extra = parse_options_header(
  142. headers["content-disposition"]
  143. )
  144. name = cast(str, extra.get("name"))
  145. filename = extra.get("filename")
  146. if filename is not None:
  147. event = File(
  148. filename=filename,
  149. headers=headers,
  150. name=name,
  151. )
  152. else:
  153. event = Field(
  154. headers=headers,
  155. name=name,
  156. )
  157. self.state = State.DATA
  158. self._search_position = 0
  159. else:
  160. # Update the search start position to be equal to the
  161. # current buffer length (already searched) minus a
  162. # safe buffer for part of the search target.
  163. self._search_position = max(0, len(self.buffer) - SEARCH_EXTRA_LENGTH)
  164. elif self.state == State.DATA:
  165. if self.buffer.find(b"--" + self.boundary) == -1:
  166. # No complete boundary in the buffer, but there may be
  167. # a partial boundary at the end. As the boundary
  168. # starts with either a nl or cr find the earliest and
  169. # return up to that as data.
  170. data_length = del_index = self.last_newline()
  171. more_data = True
  172. else:
  173. match = self.boundary_re.search(self.buffer)
  174. if match is not None:
  175. if match.group(1).startswith(b"--"):
  176. self.state = State.EPILOGUE
  177. else:
  178. self.state = State.PART
  179. data_length = match.start()
  180. del_index = match.end()
  181. else:
  182. data_length = del_index = self.last_newline()
  183. more_data = match is None
  184. data = bytes(self.buffer[:data_length])
  185. del self.buffer[:del_index]
  186. if data or not more_data:
  187. event = Data(data=data, more_data=more_data)
  188. elif self.state == State.EPILOGUE and self.complete:
  189. event = Epilogue(data=bytes(self.buffer))
  190. del self.buffer[:]
  191. self.state = State.COMPLETE
  192. if self.complete and isinstance(event, NeedData):
  193. raise ValueError(f"Invalid form-data cannot parse beyond {self.state}")
  194. return event
  195. def _parse_headers(self, data: bytes) -> Headers:
  196. headers: List[Tuple[str, str]] = []
  197. # Merge the continued headers into one line
  198. data = HEADER_CONTINUATION_RE.sub(b" ", data)
  199. # Now there is one header per line
  200. for line in data.splitlines():
  201. if line.strip() != b"":
  202. name, value = _to_str(line).strip().split(":", 1)
  203. headers.append((name.strip(), value.strip()))
  204. return Headers(headers)
  205. class MultipartEncoder:
  206. def __init__(self, boundary: bytes) -> None:
  207. self.boundary = boundary
  208. self.state = State.PREAMBLE
  209. def send_event(self, event: Event) -> bytes:
  210. if isinstance(event, Preamble) and self.state == State.PREAMBLE:
  211. self.state = State.PART
  212. return event.data
  213. elif isinstance(event, (Field, File)) and self.state in {
  214. State.PREAMBLE,
  215. State.PART,
  216. State.DATA,
  217. }:
  218. self.state = State.DATA
  219. data = b"\r\n--" + self.boundary + b"\r\n"
  220. data += b'Content-Disposition: form-data; name="%s"' % _to_bytes(event.name)
  221. if isinstance(event, File):
  222. data += b'; filename="%s"' % _to_bytes(event.filename)
  223. data += b"\r\n"
  224. for name, value in cast(Field, event).headers:
  225. if name.lower() != "content-disposition":
  226. data += _to_bytes(f"{name}: {value}\r\n")
  227. data += b"\r\n"
  228. return data
  229. elif isinstance(event, Data) and self.state == State.DATA:
  230. return event.data
  231. elif isinstance(event, Epilogue):
  232. self.state = State.COMPLETE
  233. return b"\r\n--" + self.boundary + b"--\r\n" + event.data
  234. else:
  235. raise ValueError(f"Cannot generate {event} in state: {self.state}")