formparser.py

import typing as t
from functools import update_wrapper
from io import BytesIO
from itertools import chain
from typing import Union

from . import exceptions
from .datastructures import FileStorage
from .datastructures import Headers
from .datastructures import MultiDict
from .http import parse_options_header
from .sansio.multipart import Data
from .sansio.multipart import Epilogue
from .sansio.multipart import Field
from .sansio.multipart import File
from .sansio.multipart import MultipartDecoder
from .sansio.multipart import NeedData
from .urls import url_decode_stream
from .wsgi import _make_chunk_iter
from .wsgi import get_content_length
from .wsgi import get_input_stream

# There are some platforms where SpooledTemporaryFile is not available.
# In that case we need to provide a fallback.
try:
    from tempfile import SpooledTemporaryFile
except ImportError:
    from tempfile import TemporaryFile

    SpooledTemporaryFile = None  # type: ignore

if t.TYPE_CHECKING:
    import typing as te
    from _typeshed.wsgi import WSGIEnvironment

    t_parse_result = t.Tuple[t.IO[bytes], MultiDict, MultiDict]

    class TStreamFactory(te.Protocol):
        def __call__(
            self,
            total_content_length: t.Optional[int],
            content_type: t.Optional[str],
            filename: t.Optional[str],
            content_length: t.Optional[int] = None,
        ) -> t.IO[bytes]:
            ...


F = t.TypeVar("F", bound=t.Callable[..., t.Any])


def _exhaust(stream: t.IO[bytes]) -> None:
    bts = stream.read(64 * 1024)

    while bts:
        bts = stream.read(64 * 1024)


def default_stream_factory(
    total_content_length: t.Optional[int],
    content_type: t.Optional[str],
    filename: t.Optional[str],
    content_length: t.Optional[int] = None,
) -> t.IO[bytes]:
    max_size = 1024 * 500

    if SpooledTemporaryFile is not None:
        return t.cast(t.IO[bytes], SpooledTemporaryFile(max_size=max_size, mode="rb+"))
    elif total_content_length is None or total_content_length > max_size:
        return t.cast(t.IO[bytes], TemporaryFile("rb+"))

    return BytesIO()


def parse_form_data(
    environ: "WSGIEnvironment",
    stream_factory: t.Optional["TStreamFactory"] = None,
    charset: str = "utf-8",
    errors: str = "replace",
    max_form_memory_size: t.Optional[int] = None,
    max_content_length: t.Optional[int] = None,
    cls: t.Optional[t.Type[MultiDict]] = None,
    silent: bool = True,
) -> "t_parse_result":
    """Parse the form data in the environ and return it as a tuple in the
    form ``(stream, form, files)``. You should only call this method if the
    transport method is `POST`, `PUT`, or `PATCH`.

    If the mimetype of the data transmitted is `multipart/form-data` the
    files multidict will be filled with `FileStorage` objects. If the
    mimetype is unknown the input stream is wrapped and returned as the
    first argument, else the stream is empty.

    This is a shortcut for the common usage of :class:`FormDataParser`.

    Have a look at :doc:`/request_data` for more details.

    .. versionadded:: 0.5
       The `max_form_memory_size`, `max_content_length` and
       `cls` parameters were added.

    .. versionadded:: 0.5.1
       The optional `silent` flag was added.

    :param environ: the WSGI environment to be used for parsing.
    :param stream_factory: An optional callable that returns a new read and
                           writeable file descriptor. This callable works
                           the same as :meth:`Response._get_file_stream`.
    :param charset: The character set for URL and url encoded form data.
    :param errors: The encoding error behavior.
    :param max_form_memory_size: the maximum number of bytes to be accepted for
                                 in-memory stored form data. If the data
                                 exceeds the value specified an
                                 :exc:`~exceptions.RequestEntityTooLarge`
                                 exception is raised.
    :param max_content_length: If this is provided and the transmitted data
                               is longer than this value an
                               :exc:`~exceptions.RequestEntityTooLarge`
                               exception is raised.
    :param cls: an optional dict class to use. If this is not specified
                or `None`, the default :class:`MultiDict` is used.
    :param silent: If set to False, parsing errors will not be caught.
    :return: A tuple in the form ``(stream, form, files)``.
    """
    return FormDataParser(
        stream_factory,
        charset,
        errors,
        max_form_memory_size,
        max_content_length,
        cls,
        silent,
    ).parse_from_environ(environ)
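

# Illustrative usage sketch (not part of the library): inside a WSGI
# application, ``parse_form_data`` is typically handed the environ of a
# POST/PUT/PATCH request and returns ``(stream, form, files)``.
#
#     stream, form, files = parse_form_data(environ)
#     name = form.get("name")           # text fields land in ``form``
#     upload = files.get("attachment")  # uploads land in ``files`` as FileStorage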


def exhaust_stream(f: F) -> F:
    """Helper decorator for methods that exhaust the stream on return."""

    def wrapper(self, stream, *args, **kwargs):  # type: ignore
        try:
            return f(self, stream, *args, **kwargs)
        finally:
            exhaust = getattr(stream, "exhaust", None)

            if exhaust is not None:
                exhaust()
            else:
                while True:
                    chunk = stream.read(1024 * 64)

                    if not chunk:
                        break

    return update_wrapper(t.cast(F, wrapper), f)


class FormDataParser:
    """This class implements parsing of form data for Werkzeug. By itself
    it can parse multipart and url encoded form data. It can be subclassed
    and extended, but for most mimetypes it is a better idea to use the
    untouched stream and expose it as separate attributes on a request
    object.

    .. versionadded:: 0.8

    :param stream_factory: An optional callable that returns a new read and
                           writeable file descriptor. This callable works
                           the same as :meth:`Response._get_file_stream`.
    :param charset: The character set for URL and url encoded form data.
    :param errors: The encoding error behavior.
    :param max_form_memory_size: the maximum number of bytes to be accepted for
                                 in-memory stored form data. If the data
                                 exceeds the value specified an
                                 :exc:`~exceptions.RequestEntityTooLarge`
                                 exception is raised.
    :param max_content_length: If this is provided and the transmitted data
                               is longer than this value an
                               :exc:`~exceptions.RequestEntityTooLarge`
                               exception is raised.
    :param cls: an optional dict class to use. If this is not specified
                or `None`, the default :class:`MultiDict` is used.
    :param silent: If set to False, parsing errors will not be caught.
    """

    def __init__(
        self,
        stream_factory: t.Optional["TStreamFactory"] = None,
        charset: str = "utf-8",
        errors: str = "replace",
        max_form_memory_size: t.Optional[int] = None,
        max_content_length: t.Optional[int] = None,
        cls: t.Optional[t.Type[MultiDict]] = None,
        silent: bool = True,
    ) -> None:
        if stream_factory is None:
            stream_factory = default_stream_factory

        self.stream_factory = stream_factory
        self.charset = charset
        self.errors = errors
        self.max_form_memory_size = max_form_memory_size
        self.max_content_length = max_content_length

        if cls is None:
            cls = MultiDict

        self.cls = cls
        self.silent = silent

    def get_parse_func(
        self, mimetype: str, options: t.Dict[str, str]
    ) -> t.Optional[
        t.Callable[
            ["FormDataParser", t.IO[bytes], str, t.Optional[int], t.Dict[str, str]],
            "t_parse_result",
        ]
    ]:
        return self.parse_functions.get(mimetype)

    def parse_from_environ(self, environ: "WSGIEnvironment") -> "t_parse_result":
        """Parses the information from the environment as form data.

        :param environ: the WSGI environment to be used for parsing.
        :return: A tuple in the form ``(stream, form, files)``.
        """
        content_type = environ.get("CONTENT_TYPE", "")
        content_length = get_content_length(environ)
        mimetype, options = parse_options_header(content_type)
        return self.parse(get_input_stream(environ), mimetype, content_length, options)

    def parse(
        self,
        stream: t.IO[bytes],
        mimetype: str,
        content_length: t.Optional[int],
        options: t.Optional[t.Dict[str, str]] = None,
    ) -> "t_parse_result":
        """Parses the information from the given stream, mimetype,
        content length and mimetype parameters.

        :param stream: an input stream
        :param mimetype: the mimetype of the data
        :param content_length: the content length of the incoming data
        :param options: optional mimetype parameters (used for
                        the multipart boundary for instance)
        :return: A tuple in the form ``(stream, form, files)``.
        """
        if (
            self.max_content_length is not None
            and content_length is not None
            and content_length > self.max_content_length
        ):
            # if the input stream is not exhausted, firefox reports Connection Reset
            _exhaust(stream)
            raise exceptions.RequestEntityTooLarge()

        if options is None:
            options = {}

        parse_func = self.get_parse_func(mimetype, options)

        if parse_func is not None:
            try:
                return parse_func(self, stream, mimetype, content_length, options)
            except ValueError:
                if not self.silent:
                    raise

        return stream, self.cls(), self.cls()

    @exhaust_stream
    def _parse_multipart(
        self,
        stream: t.IO[bytes],
        mimetype: str,
        content_length: t.Optional[int],
        options: t.Dict[str, str],
    ) -> "t_parse_result":
        parser = MultiPartParser(
            self.stream_factory,
            self.charset,
            self.errors,
            max_form_memory_size=self.max_form_memory_size,
            cls=self.cls,
        )
        boundary = options.get("boundary", "").encode("ascii")

        if not boundary:
            raise ValueError("Missing boundary")

        form, files = parser.parse(stream, boundary, content_length)
        return stream, form, files

    @exhaust_stream
    def _parse_urlencoded(
        self,
        stream: t.IO[bytes],
        mimetype: str,
        content_length: t.Optional[int],
        options: t.Dict[str, str],
    ) -> "t_parse_result":
        if (
            self.max_form_memory_size is not None
            and content_length is not None
            and content_length > self.max_form_memory_size
        ):
            # if the input stream is not exhausted, firefox reports Connection Reset
            _exhaust(stream)
            raise exceptions.RequestEntityTooLarge()

        form = url_decode_stream(stream, self.charset, errors=self.errors, cls=self.cls)
        return stream, form, self.cls()

    #: mapping of mimetypes to parsing functions
    parse_functions: t.Dict[
        str,
        t.Callable[
            ["FormDataParser", t.IO[bytes], str, t.Optional[int], t.Dict[str, str]],
            "t_parse_result",
        ],
    ] = {
        "multipart/form-data": _parse_multipart,
        "application/x-www-form-urlencoded": _parse_urlencoded,
        "application/x-url-encoded": _parse_urlencoded,
    }
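

# Illustrative sketch (hypothetical values, not part of the library): parsing
# an urlencoded body directly with FormDataParser.
#
#     body = b"a=1&b=2"
#     parser = FormDataParser()
#     stream, form, files = parser.parse(
#         BytesIO(body), "application/x-www-form-urlencoded", len(body)
#     )
#     form["a"]  # -> "1"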


def _line_parse(line: str) -> t.Tuple[str, bool]:
    """Removes line ending characters and returns a tuple (`stripped_line`,
    `is_terminated`).
    """
    if line[-2:] == "\r\n":
        return line[:-2], True

    elif line[-1:] in {"\r", "\n"}:
        return line[:-1], True

    return line, False


class MultiPartParser:
    def __init__(
        self,
        stream_factory: t.Optional["TStreamFactory"] = None,
        charset: str = "utf-8",
        errors: str = "replace",
        max_form_memory_size: t.Optional[int] = None,
        cls: t.Optional[t.Type[MultiDict]] = None,
        buffer_size: int = 64 * 1024,
    ) -> None:
        self.charset = charset
        self.errors = errors
        self.max_form_memory_size = max_form_memory_size

        if stream_factory is None:
            stream_factory = default_stream_factory

        self.stream_factory = stream_factory

        if cls is None:
            cls = MultiDict

        self.cls = cls
        self.buffer_size = buffer_size

    def fail(self, message: str) -> "te.NoReturn":
        raise ValueError(message)

    def get_part_charset(self, headers: Headers) -> str:
        # Figure out input charset for current part
        content_type = headers.get("content-type")

        if content_type:
            mimetype, ct_params = parse_options_header(content_type)
            return ct_params.get("charset", self.charset)

        return self.charset

    def start_file_streaming(
        self, event: File, total_content_length: t.Optional[int]
    ) -> t.IO[bytes]:
        content_type = event.headers.get("content-type")

        try:
            content_length = int(event.headers["content-length"])
        except (KeyError, ValueError):
            content_length = 0

        container = self.stream_factory(
            total_content_length=total_content_length,
            filename=event.filename,
            content_type=content_type,
            content_length=content_length,
        )
        return container

    def parse(
        self, stream: t.IO[bytes], boundary: bytes, content_length: t.Optional[int]
    ) -> t.Tuple[MultiDict, MultiDict]:
        container: t.Union[t.IO[bytes], t.List[bytes]]
        _write: t.Callable[[bytes], t.Any]

        # Feed the body to the sansio decoder chunk by chunk; the trailing
        # ``None`` tells the decoder that the input is complete.
        iterator = chain(
            _make_chunk_iter(
                stream,
                limit=content_length,
                buffer_size=self.buffer_size,
            ),
            [None],
        )

        parser = MultipartDecoder(boundary, self.max_form_memory_size)

        fields = []
        files = []

        current_part: Union[Field, File]
        for data in iterator:
            parser.receive_data(data)
            event = parser.next_event()

            # Drain every event the decoder can produce for this chunk.
            while not isinstance(event, (Epilogue, NeedData)):
                if isinstance(event, Field):
                    current_part = event
                    container = []
                    _write = container.append
                elif isinstance(event, File):
                    current_part = event
                    container = self.start_file_streaming(event, content_length)
                    _write = container.write
                elif isinstance(event, Data):
                    _write(event.data)

                    # The last Data event closes the part: decode buffered
                    # field data, or rewind the file container and wrap it.
                    if not event.more_data:
                        if isinstance(current_part, Field):
                            value = b"".join(container).decode(
                                self.get_part_charset(current_part.headers), self.errors
                            )
                            fields.append((current_part.name, value))
                        else:
                            container = t.cast(t.IO[bytes], container)
                            container.seek(0)
                            files.append(
                                (
                                    current_part.name,
                                    FileStorage(
                                        container,
                                        current_part.filename,
                                        current_part.name,
                                        headers=current_part.headers,
                                    ),
                                )
                            )

                event = parser.next_event()

        return self.cls(fields), self.cls(files)
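

# Illustrative sketch (hand-written payload, not part of the library):
# MultiPartParser is normally driven by FormDataParser._parse_multipart, but
# it can also be fed a raw multipart body and its boundary directly.
#
#     boundary = b"boundary"
#     body = (
#         b"--boundary\r\n"
#         b'Content-Disposition: form-data; name="field"\r\n\r\n'
#         b"value\r\n"
#         b"--boundary--\r\n"
#     )
#     form, files = MultiPartParser().parse(BytesIO(body), boundary, len(body))
#     form["field"]  # -> "value"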