multipart.py 8.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260
  1. import re
  2. from dataclasses import dataclass
  3. from enum import auto
  4. from enum import Enum
  5. from typing import cast
  6. from typing import List
  7. from typing import Optional
  8. from typing import Tuple
  9. from .._internal import _to_bytes
  10. from .._internal import _to_str
  11. from ..datastructures import Headers
  12. from ..exceptions import RequestEntityTooLarge
  13. from ..http import parse_options_header
  14. class Event:
  15. pass
  16. @dataclass(frozen=True)
  17. class Preamble(Event):
  18. data: bytes
  19. @dataclass(frozen=True)
  20. class Field(Event):
  21. name: str
  22. headers: Headers
  23. @dataclass(frozen=True)
  24. class File(Event):
  25. name: str
  26. filename: str
  27. headers: Headers
  28. @dataclass(frozen=True)
  29. class Data(Event):
  30. data: bytes
  31. more_data: bool
  32. @dataclass(frozen=True)
  33. class Epilogue(Event):
  34. data: bytes
  35. class NeedData(Event):
  36. pass
  37. NEED_DATA = NeedData()
  38. class State(Enum):
  39. PREAMBLE = auto()
  40. PART = auto()
  41. DATA = auto()
  42. EPILOGUE = auto()
  43. COMPLETE = auto()
  44. # Multipart line breaks MUST be CRLF (\r\n) by RFC-7578, except that
  45. # many implementations break this and either use CR or LF alone.
  46. LINE_BREAK = b"(?:\r\n|\n|\r)"
  47. BLANK_LINE_RE = re.compile(b"(?:\r\n\r\n|\r\r|\n\n)", re.MULTILINE)
  48. LINE_BREAK_RE = re.compile(LINE_BREAK, re.MULTILINE)
  49. # Header values can be continued via a space or tab after the linebreak, as
  50. # per RFC2231
  51. HEADER_CONTINUATION_RE = re.compile(b"%s[ \t]" % LINE_BREAK, re.MULTILINE)
  52. class MultipartDecoder:
  53. """Decodes a multipart message as bytes into Python events.
  54. The part data is returned as available to allow the caller to save
  55. the data from memory to disk, if desired.
  56. """
  57. def __init__(
  58. self,
  59. boundary: bytes,
  60. max_form_memory_size: Optional[int] = None,
  61. ) -> None:
  62. self.buffer = bytearray()
  63. self.complete = False
  64. self.max_form_memory_size = max_form_memory_size
  65. self.state = State.PREAMBLE
  66. self.boundary = boundary
  67. # Note in the below \h i.e. horizontal whitespace is used
  68. # as [^\S\n\r] as \h isn't supported in python.
  69. # The preamble must end with a boundary where the boundary is
  70. # prefixed by a line break, RFC2046. Except that many
  71. # implementations including Werkzeug's tests omit the line
  72. # break prefix. In addition the first boundary could be the
  73. # epilogue boundary (for empty form-data) hence the matching
  74. # group to understand if it is an epilogue boundary.
  75. self.preamble_re = re.compile(
  76. rb"%s?--%s(--[^\S\n\r]*%s?|[^\S\n\r]*%s)"
  77. % (LINE_BREAK, re.escape(boundary), LINE_BREAK, LINE_BREAK),
  78. re.MULTILINE,
  79. )
  80. # A boundary must include a line break prefix and suffix, and
  81. # may include trailing whitespace. In addition the boundary
  82. # could be the epilogue boundary hence the matching group to
  83. # understand if it is an epilogue boundary.
  84. self.boundary_re = re.compile(
  85. rb"%s--%s(--[^\S\n\r]*%s?|[^\S\n\r]*%s)"
  86. % (LINE_BREAK, re.escape(boundary), LINE_BREAK, LINE_BREAK),
  87. re.MULTILINE,
  88. )
  89. def last_newline(self) -> int:
  90. try:
  91. last_nl = self.buffer.rindex(b"\n")
  92. except ValueError:
  93. last_nl = len(self.buffer)
  94. try:
  95. last_cr = self.buffer.rindex(b"\r")
  96. except ValueError:
  97. last_cr = len(self.buffer)
  98. return min(last_nl, last_cr)
  99. def receive_data(self, data: Optional[bytes]) -> None:
  100. if data is None:
  101. self.complete = True
  102. elif (
  103. self.max_form_memory_size is not None
  104. and len(self.buffer) + len(data) > self.max_form_memory_size
  105. ):
  106. raise RequestEntityTooLarge()
  107. else:
  108. self.buffer.extend(data)
  109. def next_event(self) -> Event:
  110. event: Event = NEED_DATA
  111. if self.state == State.PREAMBLE:
  112. match = self.preamble_re.search(self.buffer)
  113. if match is not None:
  114. if match.group(1).startswith(b"--"):
  115. self.state = State.EPILOGUE
  116. else:
  117. self.state = State.PART
  118. data = bytes(self.buffer[: match.start()])
  119. del self.buffer[: match.end()]
  120. event = Preamble(data=data)
  121. elif self.state == State.PART:
  122. match = BLANK_LINE_RE.search(self.buffer)
  123. if match is not None:
  124. headers = self._parse_headers(self.buffer[: match.start()])
  125. del self.buffer[: match.end()]
  126. if "content-disposition" not in headers:
  127. raise ValueError("Missing Content-Disposition header")
  128. disposition, extra = parse_options_header(
  129. headers["content-disposition"]
  130. )
  131. name = cast(str, extra.get("name"))
  132. filename = extra.get("filename")
  133. if filename is not None:
  134. event = File(
  135. filename=filename,
  136. headers=headers,
  137. name=name,
  138. )
  139. else:
  140. event = Field(
  141. headers=headers,
  142. name=name,
  143. )
  144. self.state = State.DATA
  145. elif self.state == State.DATA:
  146. if self.buffer.find(b"--" + self.boundary) == -1:
  147. # No complete boundary in the buffer, but there may be
  148. # a partial boundary at the end. As the boundary
  149. # starts with either a nl or cr find the earliest and
  150. # return up to that as data.
  151. data_length = del_index = self.last_newline()
  152. more_data = True
  153. else:
  154. match = self.boundary_re.search(self.buffer)
  155. if match is not None:
  156. if match.group(1).startswith(b"--"):
  157. self.state = State.EPILOGUE
  158. else:
  159. self.state = State.PART
  160. data_length = match.start()
  161. del_index = match.end()
  162. else:
  163. data_length = del_index = self.last_newline()
  164. more_data = match is None
  165. data = bytes(self.buffer[:data_length])
  166. del self.buffer[:del_index]
  167. if data or not more_data:
  168. event = Data(data=data, more_data=more_data)
  169. elif self.state == State.EPILOGUE and self.complete:
  170. event = Epilogue(data=bytes(self.buffer))
  171. del self.buffer[:]
  172. self.state = State.COMPLETE
  173. if self.complete and isinstance(event, NeedData):
  174. raise ValueError(f"Invalid form-data cannot parse beyond {self.state}")
  175. return event
  176. def _parse_headers(self, data: bytes) -> Headers:
  177. headers: List[Tuple[str, str]] = []
  178. # Merge the continued headers into one line
  179. data = HEADER_CONTINUATION_RE.sub(b" ", data)
  180. # Now there is one header per line
  181. for line in data.splitlines():
  182. if line.strip() != b"":
  183. name, value = _to_str(line).strip().split(":", 1)
  184. headers.append((name.strip(), value.strip()))
  185. return Headers(headers)
  186. class MultipartEncoder:
  187. def __init__(self, boundary: bytes) -> None:
  188. self.boundary = boundary
  189. self.state = State.PREAMBLE
  190. def send_event(self, event: Event) -> bytes:
  191. if isinstance(event, Preamble) and self.state == State.PREAMBLE:
  192. self.state = State.PART
  193. return event.data
  194. elif isinstance(event, (Field, File)) and self.state in {
  195. State.PREAMBLE,
  196. State.PART,
  197. State.DATA,
  198. }:
  199. self.state = State.DATA
  200. data = b"\r\n--" + self.boundary + b"\r\n"
  201. data += b'Content-Disposition: form-data; name="%s"' % _to_bytes(event.name)
  202. if isinstance(event, File):
  203. data += b'; filename="%s"' % _to_bytes(event.filename)
  204. data += b"\r\n"
  205. for name, value in cast(Field, event).headers:
  206. if name.lower() != "content-disposition":
  207. data += _to_bytes(f"{name}: {value}\r\n")
  208. data += b"\r\n"
  209. return data
  210. elif isinstance(event, Data) and self.state == State.DATA:
  211. return event.data
  212. elif isinstance(event, Epilogue):
  213. self.state = State.COMPLETE
  214. return b"\r\n--" + self.boundary + b"--\r\n" + event.data
  215. else:
  216. raise ValueError(f"Cannot generate {event} in state: {self.state}")