1 /*
2  * Archttp - A highly performant web framework written in D.
3  *
4  * Copyright (C) 2021-2022 Kerisy.com
5  *
6  * Website: https://www.kerisy.com
7  *
8  * Licensed under the Apache-2.0 License.
9  *
10  */
11 
12 module archttp.HttpMessageParser;
13 
14 nothrow @nogc:
15 
16 /// Parser error codes
17 enum ParserError : int
18 {
19     partial = 1,    /// not enough data to parse message
20     newLine,        /// invalid character in new line
21     headerName,     /// invalid character in header name
22     headerValue,    /// invalid header value
23     status,         /// invalid character in response status
24     token,          /// invalid character in token
25     noHeaderName,   /// empty header name
26     noMethod,       /// no method in request line
27     noVersion,      /// no version in request line / response status line
28     noUri,          /// no URI in request line
29     noStatus,       /// no status code or text in status line
30     invalidMethod,  /// invalid method in request line
31     invalidVersion, /// invalid version for the protocol message
32 }
33 
34 struct HttpRequestHeader
35 {
36     const(char)[] name;
37     const(char)[] value;
38 }
39 
40 interface HttpMessageHandler
41 {
42     void onMethod(const(char)[] method);
43     void onUri(const(char)[] uri);
44     int onVersion(const(char)[] ver);
45     void onHeader(const(char)[] name, const(char)[] value);
46     void onStatus(int status);
47     void onStatusMsg(const(char)[] statusMsg);
48 }
49 
50 /**
51  *  HTTP/RTSP message parser.
52  */
53 class HttpMessageParser
54 {
55     import std.traits : ForeachType, isArray, Unqual;
56 
57     this(HttpMessageHandler handler)
58     {
59         this._messageHandler = handler;
60     }
61 
62     /**
63      *  Parses message request (request line + headers).
64      *
65      *  Params:
66      *    - buffer = buffer to parse message from
67      *    - lastPos = optional argument to store / pass previous position to which message was
68      *                already parsed (speeds up parsing when message comes in parts)
69      *
70      *  Returns:
71      *    * parsed message header length when parsed sucessfully
72      *    * `-ParserError` on error (ie. -1 when message header is not complete yet)
73      */
74     long parseRequest(T)(T buffer, ref ulong lastPos)
75         if (isArray!T && (is(Unqual!(ForeachType!T) == char) || is(Unqual!(ForeachType!T) == ubyte)))
76     {
77         static if (is(Unqual!(ForeachType!T) == char)) return parse!parseRequestLine(cast(const(ubyte)[])buffer, lastPos);
78         else return parse!parseRequestLine(buffer, lastPos);
79     }
80 
81     /// ditto
82     long parseRequest(T)(T buffer)
83         if (isArray!T && (is(Unqual!(ForeachType!T) == char) || is(Unqual!(ForeachType!T) == ubyte)))
84     {
85         ulong lastPos;
86         static if (is(Unqual!(ForeachType!T) == char)) return parse!parseRequestLine(cast(const(ubyte)[])buffer, lastPos);
87         else return parse!parseRequestLine(buffer, lastPos);
88     }
89 
90     /**
91      *  Parses message response (status line + headers).
92      *
93      *  Params:
94      *    - buffer = buffer to parse message from
95      *    - lastPos = optional argument to store / pass previous position to which message was
96      *                already parsed (speeds up parsing when message comes in parts)
97      *
98      *  Returns:
99      *    * parsed message header length when parsed sucessfully
100      *    * `-ParserError.partial` on error (ie. -1 when message header is not comlete yet)
101      */
102     long parseResponse(T)(T buffer, ref ulong lastPos)
103         if (isArray!T && (is(Unqual!(ForeachType!T) == char) || is(Unqual!(ForeachType!T) == ubyte)))
104     {
105         static if (is(Unqual!(ForeachType!T) == char)) return parse!parseStatusLine(cast(const(ubyte)[])buffer, lastPos);
106         else return parse!parseStatusLine(buffer, lastPos);
107     }
108 
109     /// ditto
110     int parseResponse(T)(T buffer)
111         if (isArray!T && (is(Unqual!(ForeachType!T) == char) || is(Unqual!(ForeachType!T) == ubyte)))
112     {
113         ulong lastPos;
114         static if (is(Unqual!(ForeachType!T) == char)) return parse!parseStatusLine(cast(const(ubyte)[])buffer, lastPos);
115         else return parse!parseStatusLine(buffer, lastPos);
116     }
117 
118     /// Gets provided structure used during parsing
119     HttpMessageHandler messageHandler() { return _messageHandler; }
120 
121 private:
122 
123     // character map of valid characters for token, forbidden:
124     //   0-SP, DEL, HT
125     //   ()<>@,;:\"/[]?={}
126     enum tokenRanges = "\0 \"\"(),,//:@[]{}\x7f\xff";
127     enum tokenSSERanges = "\0 \"\"(),,//:@[]{\xff"; // merge of last range due to the SSE register size limit
128 
129     enum versionRanges = "\0-:@[`{\xff"; // allow only [A-Za-z./] characters
130 
131     HttpMessageHandler _messageHandler;
132 
133     long parse(alias pred)(const(ubyte)[] buffer, ref ulong lastPos)
134     {
135         assert(buffer.length >= lastPos);
136         immutable l = buffer.length;
137 
138         if (_expect(!lastPos, true))
139         {
140             if (_expect(!buffer.length, false)) return err(ParserError.partial);
141 
142             // skip first empty line (some clients add CRLF after POST content)
143             if (_expect(buffer[0] == '\r', false))
144             {
145                 if (_expect(buffer.length == 1, false)) return err(ParserError.partial);
146                 if (_expect(buffer[1] != '\n', false)) return err(ParserError.newLine);
147                 lastPos += 2;
148                 buffer = buffer[lastPos..$];
149             }
150             else if (_expect(buffer[0] == '\n', false))
151                 buffer = buffer[++lastPos..$];
152 
153             immutable res = pred(buffer);
154             if (_expect(res < 0, false)) return res;
155 
156             lastPos = cast(int)(l - buffer.length); // store index of last parsed line
157         }
158         else buffer = buffer[lastPos..$]; // skip already parsed lines
159 
160         immutable hdrRes = parseHeaders(buffer);
161         lastPos = cast(int)(l - buffer.length); // store index of last parsed line
162 
163         if (_expect(hdrRes < 0, false)) return hdrRes;
164         return lastPos; // finished
165     }
166 
167     int parseHeaders(ref const(ubyte)[] buffer)
168     {
169         bool hasHeader;
170         size_t start, i;
171         const(ubyte)[] name, value;
172         while (true)
173         {
174             // check for msg headers end
175             if (_expect(buffer.length == 0, false)) return err(ParserError.partial);
176             if (buffer[0] == '\r')
177             {
178                 if (_expect(buffer.length == 1, false)) return err(ParserError.partial);
179                 if (_expect(buffer[1] != '\n', false)) return err(ParserError.newLine);
180 
181                 buffer = buffer[2..$];
182                 return 0;
183             }
184             if (_expect(buffer[0] == '\n', false))
185             {
186                 buffer = buffer[1..$];
187                 return 0;
188             }
189 
190             if (!hasHeader || (buffer[i] != ' ' && buffer[i] != '\t'))
191             {
192                 auto ret = parseToken!(tokenRanges, ':', tokenSSERanges)(buffer, i);
193                 if (_expect(ret < 0, false)) return ret;
194                 if (_expect(start == i, false)) return err(ParserError.noHeaderName);
195                 name = buffer[start..i]; // store header name
196                 i++; // move index after colon
197 
198                 // skip over SP and HT
199                 for (;; ++i)
200                 {
201                     if (_expect(i == buffer.length, false)) return err(ParserError.partial);
202                     if (buffer[i] != ' ' && buffer[i] != '\t') break;
203                 }
204                 start = i;
205             }
206             else name = null; // multiline header
207 
208             // parse value
209             auto ret = parseToken!("\0\010\012\037\177\177", "\r\n")(buffer, i);
210             if (_expect(ret < 0, false)) return ret;
211             value = buffer[start..i];
212             mixin(advanceNewline);
213             hasHeader = true; // flag to define that we can now accept multiline header values
214 
215             // remove trailing SPs and HTABs
216             if (_expect(value.length && (value[$-1] == ' ' || value[$-1] == '\t'), false))
217             {
218                 int j = cast(int)value.length - 2;
219                 for (; j >= 0; --j)
220                     if (!(value[j] == ' ' || value[j] == '\t'))
221                         break;
222                 value = value[0..j+1];
223             }
224 
225             static if (is(typeof(_messageHandler.onHeader("", "")) == void))
226                 _messageHandler.onHeader(cast(const(char)[])name, cast(const(char)[])value);
227             else {
228                 auto r = _messageHandler.onHeader(cast(const(char)[])name, cast(const(char)[])value);
229                 if (_expect(r < 0, false)) return r;
230             }
231 
232             // header line completed -> advance buffer
233             buffer = buffer[i..$];
234             start = i = 0;
235         }
236         assert(0);
237     }
238 
239     auto parseRequestLine(ref const(ubyte)[] buffer)
240     {
241         size_t start, i;
242 
243         // METHOD
244         auto ret = parseToken!(tokenRanges, ' ', tokenSSERanges)(buffer, i);
245         if (_expect(ret < 0, false)) return ret;
246         if (_expect(start == i, false)) return err(ParserError.noMethod);
247 
248         static if (is(typeof(_messageHandler.onMethod("")) == void))
249             _messageHandler.onMethod(cast(const(char)[])buffer[start..i]);
250         else {
251             auto r = _messageHandler.onMethod(cast(const(char)[])buffer[start..i]);
252             if (_expect(r < 0, false)) return r;
253         }
254         
255         mixin(skipSpaces!(ParserError.noUri));
256         start = i;
257 
258         // PATH
259         ret = parseToken!("\000\040\177\177", ' ')(buffer, i);
260         if (_expect(ret < 0, false)) return ret;
261 
262         static if (is(typeof(_messageHandler.onUri("")) == void))
263             _messageHandler.onUri(cast(const(char)[])buffer[start..i]);
264         else {
265             auto ur = _messageHandler.onUri(cast(const(char)[])buffer[start..i]);
266             if (_expect(ur < 0, false)) return ur;
267         }
268         
269         mixin(skipSpaces!(ParserError.noVersion));
270         start = i;
271 
272         // VERSION
273         ret = parseToken!(versionRanges, "\r\n")(buffer, i);
274         if (_expect(ret < 0, false)) return ret;
275 
276         static if (is(typeof(_messageHandler.onVersion("")) == void))
277             _messageHandler.onVersion(cast(const(char)[])buffer[start..i]);
278         else {
279             auto vr = _messageHandler.onVersion(cast(const(char)[])buffer[start..i]);
280             if (_expect(vr < 0, false)) return vr;
281         }
282 
283         mixin(advanceNewline);
284 
285         // advance buffer after the request line
286         buffer = buffer[i..$];
287         return 0;
288     }
289 
290     auto parseStatusLine(ref const(ubyte)[] buffer)
291     {
292         size_t start, i;
293 
294         // VERSION
295         auto ret = parseToken!(versionRanges, ' ')(buffer, i);
296         if (_expect(ret < 0, false)) return ret;
297         if (_expect(start == i, false)) return err(ParserError.noVersion);
298 
299         static if (is(typeof(_messageHandler.onVersion("")) == void))
300             _messageHandler.onVersion(cast(const(char)[])buffer[start..i]);
301         else {
302             auto r = _messageHandler.onVersion(cast(const(char)[])buffer[start..i]);
303             if (_expect(r < 0, false)) return r;
304         }
305 
306         mixin(skipSpaces!(ParserError.noStatus));
307         start = i;
308 
309         // STATUS CODE
310         if (_expect(i+3 >= buffer.length, false))
311             return err(ParserError.partial); // not enough data - we want at least [:digit:][:digit:][:digit:]<other char> to try to parse
312 
313         int code;
314         foreach (j, m; [100, 10, 1])
315         {
316             if (buffer[i+j] < '0' || buffer[i+j] > '9') return err(ParserError.status);
317             code += (buffer[start+j] - '0') * m;
318         }
319         i += 3;
320 
321         static if (is(typeof(_messageHandler.onStatus(code)) == void))
322             _messageHandler.onStatus(code);
323         else {
324             auto sr = _messageHandler.onStatus(code);
325             if (_expect(sr < 0, false)) return sr;
326         }
327 
328         if (_expect(i == buffer.length, false))
329             return err(ParserError.partial);
330         if (_expect(buffer[i] != ' ' && buffer[i] != '\r' && buffer[i] != '\n', false))
331             return err(ParserError.status); // Garbage after status
332 
333         start = i;
334 
335         // MESSAGE
336         ret = parseToken!("\0\010\012\037\177\177", "\r\n")(buffer, i);
337         if (_expect(ret < 0, false)) return ret;
338 
339         // remove preceding space (we did't advance over spaces because possibly missing status message)
340         if (i > start)
341         {
342             while (buffer[start] == ' ' && start < i) start++;
343             if (i > start)
344             {
345                 static if (is(typeof(_messageHandler.onStatusMsg("")) == void))
346                     _messageHandler.onStatusMsg(cast(const(char)[])buffer[start..i]);
347                 else {
348                     auto smr = _messageHandler.onStatusMsg(cast(const(char)[])buffer[start..i]);
349                     if (_expect(smr < 0, false)) return smr;
350                 }
351             }
352         }
353 
354         mixin(advanceNewline);
355 
356         // advance buffer after the status line
357         buffer = buffer[i..$];
358         return 0;
359     }
360 
361     /*
362      * Advances buffer over the token to the next character while checking for valid characters.
363      * On success, buffer index is left on the next character.
364      *
365      * Params:
366      *      - ranges = ranges of characters to stop on
367      *      - sseRanges = if null, same ranges is used, but they are limited to 8 ranges
368      *      - next  = next character/s to stop on (must be present in the provided ranges too)
369      * Returns: 0 on success error code otherwise
370      */
371     int parseToken(string ranges, alias next, string sseRanges = null)(const(ubyte)[] buffer, ref size_t i) pure
372     {
373         version (DigitalMars) {
374             static if (__VERSION__ >= 2094) pragma(inline, true); // older compilers can't inline this
375         } else pragma(inline, true);
376 
377         immutable charMap = parseTokenCharMap!(ranges)();
378 
379         static if (LDC_with_SSE42)
380         {
381             // CT function to prepare input for SIMD vector enum
382             static byte[16] padRanges()(string ranges)
383             {
384                 byte[16] res;
385                 // res[0..ranges.length] = cast(byte[])ranges[]; - broken on macOS betterC tests
386                 foreach (i, c; ranges) res[i] = cast(byte)c;
387                 return res;
388             }
389 
390             static if (sseRanges) alias usedRng = sseRanges;
391             else alias usedRng = ranges;
392             static assert(usedRng.length <= 16, "Ranges must be at most 16 characters long");
393             static assert(usedRng.length % 2 == 0, "Ranges must have even number of characters");
394             enum rangesSize = usedRng.length;
395             enum byte16 rngE = padRanges(usedRng);
396 
397             if (_expect(buffer.length - i >= 16, true))
398             {
399                 size_t left = (buffer.length - i) & ~15; // round down to multiple of 16
400                 byte16 ranges16 = rngE;
401 
402                 do
403                 {
404                     byte16 b16 = () @trusted { return cast(byte16)_mm_loadu_si128(cast(__m128i*)&buffer[i]); }();
405                     immutable r = _mm_cmpestri(
406                         ranges16, rangesSize,
407                         b16, 16,
408                         _SIDD_LEAST_SIGNIFICANT | _SIDD_CMP_RANGES | _SIDD_UBYTE_OPS
409                     );
410 
411                     if (r != 16)
412                     {
413                         i += r;
414                         goto FOUND;
415                     }
416                     i += 16;
417                     left -= 16;
418                 }
419                 while (_expect(left != 0, true));
420             }
421         }
422         else
423         {
424             // faster unrolled loop to iterate over 8 characters
425             loop: while (_expect(buffer.length - i >= 8, true))
426             {
427                 static foreach (_; 0..8)
428                 {
429                     if (_expect(!charMap[buffer[i]], false)) goto FOUND;
430                     ++i;
431                 }
432             }
433         }
434 
435         // handle the rest
436         if (_expect(i >= buffer.length, false)) return err(ParserError.partial);
437 
438         FOUND:
439         while (true)
440         {
441             static if (is(typeof(next) == char)) {
442                 static assert(!charMap[next], "Next character is not in ranges");
443                 if (buffer[i] == next) return 0;
444             } else {
445                 static assert(next.length > 0, "Next character not provided");
446                 static foreach (c; next) {
447                     static assert(!charMap[c], "Next character is not in ranges");
448                     if (buffer[i] == c) return 0;
449                 }
450             }
451             if (_expect(!charMap[buffer[i]], false)) return err(ParserError.token);
452             if (_expect(++i == buffer.length, false)) return err(ParserError.partial);
453         }
454     }
455 
456     // advances over new line
457     enum advanceNewline = q{
458             assert(i < buffer.length);
459             if (_expect(buffer[i] == '\r', true))
460             {
461                 if (_expect(i+1 == buffer.length, false)) return err(ParserError.partial);
462                 if (_expect(buffer[i+1] != '\n', false)) return err(ParserError.newLine);
463                 i += 2;
464             }
465             else if (buffer[i] == '\n') ++i;
466             else assert(0);
467         };
468 
469     // skips over spaces in the buffer
470     template skipSpaces(ParserError err)
471     {
472         enum skipSpaces = `
473             do {
474                 ++i;
475                 if (_expect(buffer.length == i, false)) return err(ParserError.partial);
476                 if (_expect(buffer[i] == '\r' || buffer[i] == '\n', false)) return err(` ~ err.stringof ~ `);
477             } while (buffer[i] == ' ');
478         `;
479     }
480 }
481 
482 /**
483  * Parses HTTP version from a slice returned in `onVersion` callback.
484  *
485  * Returns: minor version (0 for HTTP/1.0 or 1 for HTTP/1.1) on success or
486  *          `-ParserError.invalidVersion` on error
487  */
488 int parseHttpVersion(const(char)[] ver) pure
489 {
490     if (_expect(ver.length != 8, false)) return err(ParserError.invalidVersion);
491 
492     static foreach (i, c; "HTTP/1.")
493         if (_expect(ver[i] != c, false)) return err(ParserError.invalidVersion);
494 
495     if (_expect(ver[7] < '0' || ver[7] > '9', false)) return err(ParserError.invalidVersion);
496     
497     return ver[7] - '0';
498 }
499 
500 private:
501 
502 int err(ParserError e) pure { pragma(inline, true); return -(cast(int)e); }
503 
504 /// Builds valid char map from the provided ranges of invalid ones
505 bool[256] buildValidCharMap()(string invalidRanges)
506 {
507     assert(invalidRanges.length % 2 == 0, "Uneven ranges");
508     bool[256] res = true;
509 
510     for (int i=0; i < invalidRanges.length; i+=2)
511         for (int j=invalidRanges[i]; j <= invalidRanges[i+1]; ++j)
512             res[j] = false;
513     return res;
514 }
515 
516 immutable(bool[256]) parseTokenCharMap(string invalidRanges)() {
517     static immutable charMap = buildValidCharMap(invalidRanges);
518     return charMap;
519 }
520 
521 //** used intrinsics **//
522 
523 version(LDC)
524 {
525     public import core.simd;
526     public import ldc.intrinsics;
527     import ldc.gccbuiltins_x86;
528 
529     enum LDC_with_SSE42 = __traits(targetHasFeature, "sse4.2");
530 
531     // These specify the type of data that we're comparing.
532     enum _SIDD_UBYTE_OPS            = 0x00;
533     enum _SIDD_UWORD_OPS            = 0x01;
534     enum _SIDD_SBYTE_OPS            = 0x02;
535     enum _SIDD_SWORD_OPS            = 0x03;
536 
537     // These specify the type of comparison operation.
538     enum _SIDD_CMP_EQUAL_ANY        = 0x00;
539     enum _SIDD_CMP_RANGES           = 0x04;
540     enum _SIDD_CMP_EQUAL_EACH       = 0x08;
541     enum _SIDD_CMP_EQUAL_ORDERED    = 0x0c;
542 
543     // These are used in _mm_cmpXstri() to specify the return.
544     enum _SIDD_LEAST_SIGNIFICANT    = 0x00;
545     enum _SIDD_MOST_SIGNIFICANT     = 0x40;
546 
547     // These macros are used in _mm_cmpXstri() to specify the return.
548     enum _SIDD_BIT_MASK             = 0x00;
549     enum _SIDD_UNIT_MASK            = 0x40;
550 
551     // some definition aliases to commonly used names
552     alias __m128i = int4;
553 
554     // some used methods aliases
555     alias _expect = llvm_expect;
556     alias _mm_loadu_si128 = loadUnaligned!__m128i;
557     alias _mm_cmpestri = __builtin_ia32_pcmpestri128;
558 }
559 else
560 {
561     enum LDC_with_SSE42 = false;
562 
563     T _expect(T)(T val, T expected_val) if (__traits(isIntegral, T))
564     {
565         pragma(inline, true);
566         return val;
567     }
568 }
569 
570 pragma(msg, "SSE: ", LDC_with_SSE42);