1 /* 2 * Archttp - A highly performant web framework written in D. 3 * 4 * Copyright (C) 2021-2022 Kerisy.com 5 * 6 * Website: https://www.kerisy.com 7 * 8 * Licensed under the Apache-2.0 License. 9 * 10 */ 11 12 module archttp.HttpMessageParser; 13 14 nothrow @nogc: 15 16 /// Parser error codes 17 enum ParserError : int 18 { 19 partial = 1, /// not enough data to parse message 20 newLine, /// invalid character in new line 21 headerName, /// invalid character in header name 22 headerValue, /// invalid header value 23 status, /// invalid character in response status 24 token, /// invalid character in token 25 noHeaderName, /// empty header name 26 noMethod, /// no method in request line 27 noVersion, /// no version in request line / response status line 28 noUri, /// no URI in request line 29 noStatus, /// no status code or text in status line 30 invalidMethod, /// invalid method in request line 31 invalidVersion, /// invalid version for the protocol message 32 } 33 34 struct HttpRequestHeader 35 { 36 const(char)[] name; 37 const(char)[] value; 38 } 39 40 interface HttpMessageHandler 41 { 42 void onMethod(const(char)[] method); 43 void onUri(const(char)[] uri); 44 int onVersion(const(char)[] ver); 45 void onHeader(const(char)[] name, const(char)[] value); 46 void onStatus(int status); 47 void onStatusMsg(const(char)[] statusMsg); 48 } 49 50 /** 51 * HTTP/RTSP message parser. 52 */ 53 class HttpMessageParser 54 { 55 import std.traits : ForeachType, isArray, Unqual; 56 57 this(HttpMessageHandler handler) 58 { 59 this._messageHandler = handler; 60 } 61 62 /** 63 * Parses message request (request line + headers). 64 * 65 * Params: 66 * - buffer = buffer to parse message from 67 * - lastPos = optional argument to store / pass previous position to which message was 68 * already parsed (speeds up parsing when message comes in parts) 69 * 70 * Returns: 71 * * parsed message header length when parsed sucessfully 72 * * `-ParserError` on error (ie. -1 when message header is not complete yet) 73 */ 74 long parseRequest(T)(T buffer, ref ulong lastPos) 75 if (isArray!T && (is(Unqual!(ForeachType!T) == char) || is(Unqual!(ForeachType!T) == ubyte))) 76 { 77 static if (is(Unqual!(ForeachType!T) == char)) return parse!parseRequestLine(cast(const(ubyte)[])buffer, lastPos); 78 else return parse!parseRequestLine(buffer, lastPos); 79 } 80 81 /// ditto 82 long parseRequest(T)(T buffer) 83 if (isArray!T && (is(Unqual!(ForeachType!T) == char) || is(Unqual!(ForeachType!T) == ubyte))) 84 { 85 ulong lastPos; 86 static if (is(Unqual!(ForeachType!T) == char)) return parse!parseRequestLine(cast(const(ubyte)[])buffer, lastPos); 87 else return parse!parseRequestLine(buffer, lastPos); 88 } 89 90 /** 91 * Parses message response (status line + headers). 92 * 93 * Params: 94 * - buffer = buffer to parse message from 95 * - lastPos = optional argument to store / pass previous position to which message was 96 * already parsed (speeds up parsing when message comes in parts) 97 * 98 * Returns: 99 * * parsed message header length when parsed sucessfully 100 * * `-ParserError.partial` on error (ie. -1 when message header is not comlete yet) 101 */ 102 long parseResponse(T)(T buffer, ref ulong lastPos) 103 if (isArray!T && (is(Unqual!(ForeachType!T) == char) || is(Unqual!(ForeachType!T) == ubyte))) 104 { 105 static if (is(Unqual!(ForeachType!T) == char)) return parse!parseStatusLine(cast(const(ubyte)[])buffer, lastPos); 106 else return parse!parseStatusLine(buffer, lastPos); 107 } 108 109 /// ditto 110 int parseResponse(T)(T buffer) 111 if (isArray!T && (is(Unqual!(ForeachType!T) == char) || is(Unqual!(ForeachType!T) == ubyte))) 112 { 113 ulong lastPos; 114 static if (is(Unqual!(ForeachType!T) == char)) return parse!parseStatusLine(cast(const(ubyte)[])buffer, lastPos); 115 else return parse!parseStatusLine(buffer, lastPos); 116 } 117 118 /// Gets provided structure used during parsing 119 HttpMessageHandler messageHandler() { return _messageHandler; } 120 121 private: 122 123 // character map of valid characters for token, forbidden: 124 // 0-SP, DEL, HT 125 // ()<>@,;:\"/[]?={} 126 enum tokenRanges = "\0 \"\"(),,//:@[]{}\x7f\xff"; 127 enum tokenSSERanges = "\0 \"\"(),,//:@[]{\xff"; // merge of last range due to the SSE register size limit 128 129 enum versionRanges = "\0-:@[`{\xff"; // allow only [A-Za-z./] characters 130 131 HttpMessageHandler _messageHandler; 132 133 long parse(alias pred)(const(ubyte)[] buffer, ref ulong lastPos) 134 { 135 assert(buffer.length >= lastPos); 136 immutable l = buffer.length; 137 138 if (_expect(!lastPos, true)) 139 { 140 if (_expect(!buffer.length, false)) return err(ParserError.partial); 141 142 // skip first empty line (some clients add CRLF after POST content) 143 if (_expect(buffer[0] == '\r', false)) 144 { 145 if (_expect(buffer.length == 1, false)) return err(ParserError.partial); 146 if (_expect(buffer[1] != '\n', false)) return err(ParserError.newLine); 147 lastPos += 2; 148 buffer = buffer[lastPos..$]; 149 } 150 else if (_expect(buffer[0] == '\n', false)) 151 buffer = buffer[++lastPos..$]; 152 153 immutable res = pred(buffer); 154 if (_expect(res < 0, false)) return res; 155 156 lastPos = cast(int)(l - buffer.length); // store index of last parsed line 157 } 158 else buffer = buffer[lastPos..$]; // skip already parsed lines 159 160 immutable hdrRes = parseHeaders(buffer); 161 lastPos = cast(int)(l - buffer.length); // store index of last parsed line 162 163 if (_expect(hdrRes < 0, false)) return hdrRes; 164 return lastPos; // finished 165 } 166 167 int parseHeaders(ref const(ubyte)[] buffer) 168 { 169 bool hasHeader; 170 size_t start, i; 171 const(ubyte)[] name, value; 172 while (true) 173 { 174 // check for msg headers end 175 if (_expect(buffer.length == 0, false)) return err(ParserError.partial); 176 if (buffer[0] == '\r') 177 { 178 if (_expect(buffer.length == 1, false)) return err(ParserError.partial); 179 if (_expect(buffer[1] != '\n', false)) return err(ParserError.newLine); 180 181 buffer = buffer[2..$]; 182 return 0; 183 } 184 if (_expect(buffer[0] == '\n', false)) 185 { 186 buffer = buffer[1..$]; 187 return 0; 188 } 189 190 if (!hasHeader || (buffer[i] != ' ' && buffer[i] != '\t')) 191 { 192 auto ret = parseToken!(tokenRanges, ':', tokenSSERanges)(buffer, i); 193 if (_expect(ret < 0, false)) return ret; 194 if (_expect(start == i, false)) return err(ParserError.noHeaderName); 195 name = buffer[start..i]; // store header name 196 i++; // move index after colon 197 198 // skip over SP and HT 199 for (;; ++i) 200 { 201 if (_expect(i == buffer.length, false)) return err(ParserError.partial); 202 if (buffer[i] != ' ' && buffer[i] != '\t') break; 203 } 204 start = i; 205 } 206 else name = null; // multiline header 207 208 // parse value 209 auto ret = parseToken!("\0\010\012\037\177\177", "\r\n")(buffer, i); 210 if (_expect(ret < 0, false)) return ret; 211 value = buffer[start..i]; 212 mixin(advanceNewline); 213 hasHeader = true; // flag to define that we can now accept multiline header values 214 215 // remove trailing SPs and HTABs 216 if (_expect(value.length && (value[$-1] == ' ' || value[$-1] == '\t'), false)) 217 { 218 int j = cast(int)value.length - 2; 219 for (; j >= 0; --j) 220 if (!(value[j] == ' ' || value[j] == '\t')) 221 break; 222 value = value[0..j+1]; 223 } 224 225 static if (is(typeof(_messageHandler.onHeader("", "")) == void)) 226 _messageHandler.onHeader(cast(const(char)[])name, cast(const(char)[])value); 227 else { 228 auto r = _messageHandler.onHeader(cast(const(char)[])name, cast(const(char)[])value); 229 if (_expect(r < 0, false)) return r; 230 } 231 232 // header line completed -> advance buffer 233 buffer = buffer[i..$]; 234 start = i = 0; 235 } 236 assert(0); 237 } 238 239 auto parseRequestLine(ref const(ubyte)[] buffer) 240 { 241 size_t start, i; 242 243 // METHOD 244 auto ret = parseToken!(tokenRanges, ' ', tokenSSERanges)(buffer, i); 245 if (_expect(ret < 0, false)) return ret; 246 if (_expect(start == i, false)) return err(ParserError.noMethod); 247 248 static if (is(typeof(_messageHandler.onMethod("")) == void)) 249 _messageHandler.onMethod(cast(const(char)[])buffer[start..i]); 250 else { 251 auto r = _messageHandler.onMethod(cast(const(char)[])buffer[start..i]); 252 if (_expect(r < 0, false)) return r; 253 } 254 255 mixin(skipSpaces!(ParserError.noUri)); 256 start = i; 257 258 // PATH 259 ret = parseToken!("\000\040\177\177", ' ')(buffer, i); 260 if (_expect(ret < 0, false)) return ret; 261 262 static if (is(typeof(_messageHandler.onUri("")) == void)) 263 _messageHandler.onUri(cast(const(char)[])buffer[start..i]); 264 else { 265 auto ur = _messageHandler.onUri(cast(const(char)[])buffer[start..i]); 266 if (_expect(ur < 0, false)) return ur; 267 } 268 269 mixin(skipSpaces!(ParserError.noVersion)); 270 start = i; 271 272 // VERSION 273 ret = parseToken!(versionRanges, "\r\n")(buffer, i); 274 if (_expect(ret < 0, false)) return ret; 275 276 static if (is(typeof(_messageHandler.onVersion("")) == void)) 277 _messageHandler.onVersion(cast(const(char)[])buffer[start..i]); 278 else { 279 auto vr = _messageHandler.onVersion(cast(const(char)[])buffer[start..i]); 280 if (_expect(vr < 0, false)) return vr; 281 } 282 283 mixin(advanceNewline); 284 285 // advance buffer after the request line 286 buffer = buffer[i..$]; 287 return 0; 288 } 289 290 auto parseStatusLine(ref const(ubyte)[] buffer) 291 { 292 size_t start, i; 293 294 // VERSION 295 auto ret = parseToken!(versionRanges, ' ')(buffer, i); 296 if (_expect(ret < 0, false)) return ret; 297 if (_expect(start == i, false)) return err(ParserError.noVersion); 298 299 static if (is(typeof(_messageHandler.onVersion("")) == void)) 300 _messageHandler.onVersion(cast(const(char)[])buffer[start..i]); 301 else { 302 auto r = _messageHandler.onVersion(cast(const(char)[])buffer[start..i]); 303 if (_expect(r < 0, false)) return r; 304 } 305 306 mixin(skipSpaces!(ParserError.noStatus)); 307 start = i; 308 309 // STATUS CODE 310 if (_expect(i+3 >= buffer.length, false)) 311 return err(ParserError.partial); // not enough data - we want at least [:digit:][:digit:][:digit:]<other char> to try to parse 312 313 int code; 314 foreach (j, m; [100, 10, 1]) 315 { 316 if (buffer[i+j] < '0' || buffer[i+j] > '9') return err(ParserError.status); 317 code += (buffer[start+j] - '0') * m; 318 } 319 i += 3; 320 321 static if (is(typeof(_messageHandler.onStatus(code)) == void)) 322 _messageHandler.onStatus(code); 323 else { 324 auto sr = _messageHandler.onStatus(code); 325 if (_expect(sr < 0, false)) return sr; 326 } 327 328 if (_expect(i == buffer.length, false)) 329 return err(ParserError.partial); 330 if (_expect(buffer[i] != ' ' && buffer[i] != '\r' && buffer[i] != '\n', false)) 331 return err(ParserError.status); // Garbage after status 332 333 start = i; 334 335 // MESSAGE 336 ret = parseToken!("\0\010\012\037\177\177", "\r\n")(buffer, i); 337 if (_expect(ret < 0, false)) return ret; 338 339 // remove preceding space (we did't advance over spaces because possibly missing status message) 340 if (i > start) 341 { 342 while (buffer[start] == ' ' && start < i) start++; 343 if (i > start) 344 { 345 static if (is(typeof(_messageHandler.onStatusMsg("")) == void)) 346 _messageHandler.onStatusMsg(cast(const(char)[])buffer[start..i]); 347 else { 348 auto smr = _messageHandler.onStatusMsg(cast(const(char)[])buffer[start..i]); 349 if (_expect(smr < 0, false)) return smr; 350 } 351 } 352 } 353 354 mixin(advanceNewline); 355 356 // advance buffer after the status line 357 buffer = buffer[i..$]; 358 return 0; 359 } 360 361 /* 362 * Advances buffer over the token to the next character while checking for valid characters. 363 * On success, buffer index is left on the next character. 364 * 365 * Params: 366 * - ranges = ranges of characters to stop on 367 * - sseRanges = if null, same ranges is used, but they are limited to 8 ranges 368 * - next = next character/s to stop on (must be present in the provided ranges too) 369 * Returns: 0 on success error code otherwise 370 */ 371 int parseToken(string ranges, alias next, string sseRanges = null)(const(ubyte)[] buffer, ref size_t i) pure 372 { 373 version (DigitalMars) { 374 static if (__VERSION__ >= 2094) pragma(inline, true); // older compilers can't inline this 375 } else pragma(inline, true); 376 377 immutable charMap = parseTokenCharMap!(ranges)(); 378 379 static if (LDC_with_SSE42) 380 { 381 // CT function to prepare input for SIMD vector enum 382 static byte[16] padRanges()(string ranges) 383 { 384 byte[16] res; 385 // res[0..ranges.length] = cast(byte[])ranges[]; - broken on macOS betterC tests 386 foreach (i, c; ranges) res[i] = cast(byte)c; 387 return res; 388 } 389 390 static if (sseRanges) alias usedRng = sseRanges; 391 else alias usedRng = ranges; 392 static assert(usedRng.length <= 16, "Ranges must be at most 16 characters long"); 393 static assert(usedRng.length % 2 == 0, "Ranges must have even number of characters"); 394 enum rangesSize = usedRng.length; 395 enum byte16 rngE = padRanges(usedRng); 396 397 if (_expect(buffer.length - i >= 16, true)) 398 { 399 size_t left = (buffer.length - i) & ~15; // round down to multiple of 16 400 byte16 ranges16 = rngE; 401 402 do 403 { 404 byte16 b16 = () @trusted { return cast(byte16)_mm_loadu_si128(cast(__m128i*)&buffer[i]); }(); 405 immutable r = _mm_cmpestri( 406 ranges16, rangesSize, 407 b16, 16, 408 _SIDD_LEAST_SIGNIFICANT | _SIDD_CMP_RANGES | _SIDD_UBYTE_OPS 409 ); 410 411 if (r != 16) 412 { 413 i += r; 414 goto FOUND; 415 } 416 i += 16; 417 left -= 16; 418 } 419 while (_expect(left != 0, true)); 420 } 421 } 422 else 423 { 424 // faster unrolled loop to iterate over 8 characters 425 loop: while (_expect(buffer.length - i >= 8, true)) 426 { 427 static foreach (_; 0..8) 428 { 429 if (_expect(!charMap[buffer[i]], false)) goto FOUND; 430 ++i; 431 } 432 } 433 } 434 435 // handle the rest 436 if (_expect(i >= buffer.length, false)) return err(ParserError.partial); 437 438 FOUND: 439 while (true) 440 { 441 static if (is(typeof(next) == char)) { 442 static assert(!charMap[next], "Next character is not in ranges"); 443 if (buffer[i] == next) return 0; 444 } else { 445 static assert(next.length > 0, "Next character not provided"); 446 static foreach (c; next) { 447 static assert(!charMap[c], "Next character is not in ranges"); 448 if (buffer[i] == c) return 0; 449 } 450 } 451 if (_expect(!charMap[buffer[i]], false)) return err(ParserError.token); 452 if (_expect(++i == buffer.length, false)) return err(ParserError.partial); 453 } 454 } 455 456 // advances over new line 457 enum advanceNewline = q{ 458 assert(i < buffer.length); 459 if (_expect(buffer[i] == '\r', true)) 460 { 461 if (_expect(i+1 == buffer.length, false)) return err(ParserError.partial); 462 if (_expect(buffer[i+1] != '\n', false)) return err(ParserError.newLine); 463 i += 2; 464 } 465 else if (buffer[i] == '\n') ++i; 466 else assert(0); 467 }; 468 469 // skips over spaces in the buffer 470 template skipSpaces(ParserError err) 471 { 472 enum skipSpaces = ` 473 do { 474 ++i; 475 if (_expect(buffer.length == i, false)) return err(ParserError.partial); 476 if (_expect(buffer[i] == '\r' || buffer[i] == '\n', false)) return err(` ~ err.stringof ~ `); 477 } while (buffer[i] == ' '); 478 `; 479 } 480 } 481 482 /** 483 * Parses HTTP version from a slice returned in `onVersion` callback. 484 * 485 * Returns: minor version (0 for HTTP/1.0 or 1 for HTTP/1.1) on success or 486 * `-ParserError.invalidVersion` on error 487 */ 488 int parseHttpVersion(const(char)[] ver) pure 489 { 490 if (_expect(ver.length != 8, false)) return err(ParserError.invalidVersion); 491 492 static foreach (i, c; "HTTP/1.") 493 if (_expect(ver[i] != c, false)) return err(ParserError.invalidVersion); 494 495 if (_expect(ver[7] < '0' || ver[7] > '9', false)) return err(ParserError.invalidVersion); 496 497 return ver[7] - '0'; 498 } 499 500 private: 501 502 int err(ParserError e) pure { pragma(inline, true); return -(cast(int)e); } 503 504 /// Builds valid char map from the provided ranges of invalid ones 505 bool[256] buildValidCharMap()(string invalidRanges) 506 { 507 assert(invalidRanges.length % 2 == 0, "Uneven ranges"); 508 bool[256] res = true; 509 510 for (int i=0; i < invalidRanges.length; i+=2) 511 for (int j=invalidRanges[i]; j <= invalidRanges[i+1]; ++j) 512 res[j] = false; 513 return res; 514 } 515 516 immutable(bool[256]) parseTokenCharMap(string invalidRanges)() { 517 static immutable charMap = buildValidCharMap(invalidRanges); 518 return charMap; 519 } 520 521 //** used intrinsics **// 522 523 version(LDC) 524 { 525 public import core.simd; 526 public import ldc.intrinsics; 527 import ldc.gccbuiltins_x86; 528 529 enum LDC_with_SSE42 = __traits(targetHasFeature, "sse4.2"); 530 531 // These specify the type of data that we're comparing. 532 enum _SIDD_UBYTE_OPS = 0x00; 533 enum _SIDD_UWORD_OPS = 0x01; 534 enum _SIDD_SBYTE_OPS = 0x02; 535 enum _SIDD_SWORD_OPS = 0x03; 536 537 // These specify the type of comparison operation. 538 enum _SIDD_CMP_EQUAL_ANY = 0x00; 539 enum _SIDD_CMP_RANGES = 0x04; 540 enum _SIDD_CMP_EQUAL_EACH = 0x08; 541 enum _SIDD_CMP_EQUAL_ORDERED = 0x0c; 542 543 // These are used in _mm_cmpXstri() to specify the return. 544 enum _SIDD_LEAST_SIGNIFICANT = 0x00; 545 enum _SIDD_MOST_SIGNIFICANT = 0x40; 546 547 // These macros are used in _mm_cmpXstri() to specify the return. 548 enum _SIDD_BIT_MASK = 0x00; 549 enum _SIDD_UNIT_MASK = 0x40; 550 551 // some definition aliases to commonly used names 552 alias __m128i = int4; 553 554 // some used methods aliases 555 alias _expect = llvm_expect; 556 alias _mm_loadu_si128 = loadUnaligned!__m128i; 557 alias _mm_cmpestri = __builtin_ia32_pcmpestri128; 558 } 559 else 560 { 561 enum LDC_with_SSE42 = false; 562 563 T _expect(T)(T val, T expected_val) if (__traits(isIntegral, T)) 564 { 565 pragma(inline, true); 566 return val; 567 } 568 } 569 570 pragma(msg, "SSE: ", LDC_with_SSE42);