sniffer.js 36 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990
  1. "use strict";
  2. Object.defineProperty(exports, "__esModule", { value: true });
  3. exports.Sniffer = exports.STRINGS = exports.ResultType = void 0;
  4. exports.getEncoding = getEncoding;
  5. const whatwg_encoding_1 = require("whatwg-encoding");
  6. // https://html.spec.whatwg.org/multipage/syntax.html#prescan-a-byte-stream-to-determine-its-encoding
  var State;
  (function (State) {
    // Ordered list of state names; a name's index in this list is its numeric value.
    const names = [
      // Before anything starts; can be any of BOM, UTF-16 XML declarations or meta tags
      "Begin",
      // Inside of a BOM
      "BOM16BE",
      "BOM16LE",
      "BOM8",
      // XML prefix
      "UTF16LE_XML_PREFIX",
      "BeginLT",
      "UTF16BE_XML_PREFIX",
      // Waiting for opening `<`
      "BeforeTag",
      // After the opening `<`
      "BeforeTagName",
      // After `</`
      "BeforeCloseTagName",
      // Beginning of a comment
      "CommentStart",
      // End of a comment
      "CommentEnd",
      // A tag name that could be `meta`
      "TagNameMeta",
      // A tag name that is not `meta`
      "TagNameOther",
      // XML declaration
      "XMLDeclaration",
      "XMLDeclarationBeforeEncoding",
      "XMLDeclarationAfterEncoding",
      "XMLDeclarationBeforeValue",
      "XMLDeclarationValue",
      // Anything that looks like a tag, but doesn't fit in the above categories
      "WeirdTag",
      "BeforeAttribute",
      /*
       * Attributes in a meta tag: the name is compared against the names we
       * care about (http-equiv, content, charset), backing out on mismatch.
       */
      "MetaAttribHttpEquiv",
      // The value has to be `content-type`
      "MetaAttribHttpEquivValue",
      "MetaAttribC",
      "MetaAttribContent",
      "MetaAttribCharset",
      // Waiting for whitespace
      "MetaAttribAfterName",
      "MetaContentValueQuotedBeforeEncoding",
      "MetaContentValueQuotedAfterEncoding",
      "MetaContentValueQuotedBeforeValue",
      "MetaContentValueQuotedValueQuoted",
      "MetaContentValueQuotedValueUnquoted",
      "MetaContentValueUnquotedBeforeEncoding",
      "MetaContentValueUnquotedBeforeValue",
      "MetaContentValueUnquotedValueQuoted",
      "MetaContentValueUnquotedValueUnquoted",
      "AnyAttribName",
      // After the name of an attribute, before the equals sign
      "AfterAttributeName",
      // After `=`
      "BeforeAttributeValue",
      "AttributeValueQuoted",
      "AttributeValueUnquoted",
    ];
    // Register both directions: name -> value and value -> name.
    names.forEach((name, value) => {
      State[name] = value;
      State[value] = name;
    });
  })(State || (State = {}));
  var ResultType;
  (function (ResultType) {
    // A name's index in this list is its numeric value.
    const kinds = [
      // Byte order mark
      "BOM",
      // User- or transport layer-defined
      "PASSED",
      // XML prefixes
      "XML_PREFIX",
      // Meta tag
      "META_TAG",
      // XML encoding
      "XML_ENCODING",
      // Default
      "DEFAULT",
    ];
    // Register both directions: name -> value and value -> name.
    kinds.forEach((kind, value) => {
      ResultType[kind] = value;
      ResultType[value] = kind;
    });
  })(ResultType || (exports.ResultType = ResultType = {}));
  var AttribType;
  (function (AttribType) {
    // Which meta attribute is currently being read; index in the list is the value.
    ["None", "HttpEquiv", "Content", "Charset"].forEach((name, value) => {
      AttribType[name] = value;
      AttribType[value] = name;
    });
  })(AttribType || (AttribType = {}));
  var Chars;
  (function (Chars) {
    // Byte values of the ASCII characters the sniffer compares against.
    const codes = {
      NIL: 0,
      TAB: 9,
      LF: 10,
      CR: 13,
      SPACE: 32,
      EXCLAMATION: 33,
      DQUOTE: 34,
      SQUOTE: 39,
      DASH: 45,
      SLASH: 47,
      SEMICOLON: 59,
      LT: 60,
      EQUALS: 61,
      GT: 62,
      QUESTION: 63,
      UpperA: 65,
      UpperZ: 90,
      LowerA: 97,
      LowerZ: 122,
    };
    // Register both directions: name -> code and code -> name.
    for (const [name, code] of Object.entries(codes)) {
      Chars[name] = code;
      Chars[code] = name;
    }
  })(Chars || (Chars = {}));
  // Bytes treated as whitespace between tokens inside a tag.
  const SPACE_CHARACTERS = new Set([
    Chars.SPACE,
    Chars.LF,
    Chars.CR,
    Chars.TAB,
  ]);
  // An unquoted attribute value ends at whitespace or at the closing `>`.
  const END_OF_UNQUOTED_ATTRIBUTE_VALUE = new Set([
    ...SPACE_CHARACTERS,
    Chars.GT,
  ]);
  /**
   * Converts a string to bytes, one byte per UTF-16 code unit.
   *
   * Code units above 0xff are truncated by the `Uint8Array` element
   * conversion, so this is only lossless for Latin-1/ASCII input.
   *
   * @param {string} str The string to convert.
   * @returns {Uint8Array} An array with `str.length` entries.
   */
  function toUint8Array(str) {
    return Uint8Array.from({ length: str.length }, (_, index) =>
      str.charCodeAt(index),
    );
  }
  130. exports.STRINGS = {
  131. UTF8_BOM: new Uint8Array([0xef, 0xbb, 0xbf]),
  132. UTF16LE_BOM: new Uint8Array([0xff, 0xfe]),
  133. UTF16BE_BOM: new Uint8Array([0xfe, 0xff]),
  134. UTF16LE_XML_PREFIX: new Uint8Array([0x3c, 0x0, 0x3f, 0x0, 0x78, 0x0]),
  135. UTF16BE_XML_PREFIX: new Uint8Array([0x0, 0x3c, 0x0, 0x3f, 0x0, 0x78]),
  136. XML_DECLARATION: toUint8Array("<?xml"),
  137. ENCODING: toUint8Array("encoding"),
  138. META: toUint8Array("meta"),
  139. HTTP_EQUIV: toUint8Array("http-equiv"),
  140. CONTENT: toUint8Array("content"),
  141. CONTENT_TYPE: toUint8Array("content-type"),
  142. CHARSET: toUint8Array("charset"),
  143. COMMENT_START: toUint8Array("<!--"),
  144. COMMENT_END: toUint8Array("-->"),
  145. };
  /**
   * Tests whether a byte is an ASCII letter (`A`-`Z` or `a`-`z`).
   *
   * @param {number} c The byte to test.
   * @returns {boolean} `true` for ASCII letters.
   */
  function isAsciiAlpha(c) {
    const isUpper = Chars.UpperA <= c && c <= Chars.UpperZ;
    const isLower = Chars.LowerA <= c && c <= Chars.LowerZ;
    return isUpper || isLower;
  }
  /**
   * Tests whether a byte is a single or double quote.
   *
   * @param {number} c The byte to test.
   * @returns {boolean} `true` for `"` and `'`.
   */
  function isQuote(c) {
    switch (c) {
      case Chars.DQUOTE:
      case Chars.SQUOTE: {
        return true;
      }
      default: {
        return false;
      }
    }
  }
  153. class Sniffer {
  154. setResult(label, type) {
  155. if (this.resultType === ResultType.DEFAULT || this.resultType > type) {
  156. const encoding = (0, whatwg_encoding_1.labelToName)(label);
  157. if (encoding) {
  158. this.encoding =
  159. // Check if we are in a meta tag and the encoding is `x-user-defined`
  160. type === ResultType.META_TAG &&
  161. encoding === "x-user-defined"
  162. ? "windows-1252"
  163. : // Check if we are in a meta tag or xml declaration, and the encoding is UTF-16
  164. (type === ResultType.META_TAG ||
  165. type === ResultType.XML_ENCODING) &&
  166. (encoding === "UTF-16LE" || encoding === "UTF-16BE")
  167. ? "UTF-8"
  168. : encoding;
  169. this.resultType = type;
  170. }
  171. }
  172. }
  173. constructor({ maxBytes = 1024, userEncoding, transportLayerEncodingLabel, defaultEncoding, } = {}) {
  174. /** The offset of the previous buffers. */
  175. this.offset = 0;
  176. this.state = State.Begin;
  177. this.sectionIndex = 0;
  178. this.attribType = AttribType.None;
  179. /**
  180. * Indicates if the `http-equiv` is `content-type`.
  181. *
  182. * Initially `null`, a boolean when a value is found.
  183. */
  184. this.gotPragma = null;
  185. this.needsPragma = null;
  186. this.inMetaTag = false;
  187. this.encoding = "windows-1252";
  188. this.resultType = ResultType.DEFAULT;
  189. this.quoteCharacter = 0;
  190. this.attributeValue = [];
  191. this.maxBytes = maxBytes;
  192. if (userEncoding) {
  193. this.setResult(userEncoding, ResultType.PASSED);
  194. }
  195. if (transportLayerEncodingLabel) {
  196. this.setResult(transportLayerEncodingLabel, ResultType.PASSED);
  197. }
  198. if (defaultEncoding) {
  199. this.setResult(defaultEncoding, ResultType.DEFAULT);
  200. }
  201. }
  202. stateBegin(c) {
  203. switch (c) {
  204. case exports.STRINGS.UTF16BE_BOM[0]: {
  205. this.state = State.BOM16BE;
  206. break;
  207. }
  208. case exports.STRINGS.UTF16LE_BOM[0]: {
  209. this.state = State.BOM16LE;
  210. break;
  211. }
  212. case exports.STRINGS.UTF8_BOM[0]: {
  213. this.sectionIndex = 1;
  214. this.state = State.BOM8;
  215. break;
  216. }
  217. case Chars.NIL: {
  218. this.state = State.UTF16BE_XML_PREFIX;
  219. this.sectionIndex = 1;
  220. break;
  221. }
  222. case Chars.LT: {
  223. this.state = State.BeginLT;
  224. break;
  225. }
  226. default: {
  227. this.state = State.BeforeTag;
  228. }
  229. }
  230. }
  231. stateBeginLT(c) {
  232. if (c === Chars.NIL) {
  233. this.state = State.UTF16LE_XML_PREFIX;
  234. this.sectionIndex = 2;
  235. }
  236. else if (c === Chars.QUESTION) {
  237. this.state = State.XMLDeclaration;
  238. this.sectionIndex = 2;
  239. }
  240. else {
  241. this.state = State.BeforeTagName;
  242. this.stateBeforeTagName(c);
  243. }
  244. }
  245. stateUTF16BE_XML_PREFIX(c) {
  246. // Advance position in the section
  247. if (this.advanceSection(exports.STRINGS.UTF16BE_XML_PREFIX, c)) {
  248. if (this.sectionIndex === exports.STRINGS.UTF16BE_XML_PREFIX.length) {
  249. // We have the whole prefix
  250. this.setResult("utf-16be", ResultType.XML_PREFIX);
  251. }
  252. }
  253. else {
  254. this.state = State.BeforeTag;
  255. this.stateBeforeTag(c);
  256. }
  257. }
  258. stateUTF16LE_XML_PREFIX(c) {
  259. // Advance position in the section
  260. if (this.advanceSection(exports.STRINGS.UTF16LE_XML_PREFIX, c)) {
  261. if (this.sectionIndex === exports.STRINGS.UTF16LE_XML_PREFIX.length) {
  262. // We have the whole prefix
  263. this.setResult("utf-16le", ResultType.XML_PREFIX);
  264. }
  265. }
  266. else {
  267. this.state = State.BeforeTag;
  268. this.stateBeforeTag(c);
  269. }
  270. }
  271. stateBOM16LE(c) {
  272. if (c === exports.STRINGS.UTF16LE_BOM[1]) {
  273. this.setResult("utf-16le", ResultType.BOM);
  274. }
  275. else {
  276. this.state = State.BeforeTag;
  277. this.stateBeforeTag(c);
  278. }
  279. }
  280. stateBOM16BE(c) {
  281. if (c === exports.STRINGS.UTF16BE_BOM[1]) {
  282. this.setResult("utf-16be", ResultType.BOM);
  283. }
  284. else {
  285. this.state = State.BeforeTag;
  286. this.stateBeforeTag(c);
  287. }
  288. }
  289. stateBOM8(c) {
  290. if (this.advanceSection(exports.STRINGS.UTF8_BOM, c) &&
  291. this.sectionIndex === exports.STRINGS.UTF8_BOM.length) {
  292. this.setResult("utf-8", ResultType.BOM);
  293. }
  294. }
  295. stateBeforeTag(c) {
  296. if (c === Chars.LT) {
  297. this.state = State.BeforeTagName;
  298. this.inMetaTag = false;
  299. }
  300. }
  301. /**
  302. * We have seen a `<`, and now have to figure out what to do.
  303. *
  304. * Options:
  305. * - `<meta`
  306. * - Any other tag
  307. * - A closing tag
  308. * - `<!--`
  309. * - An XML declaration
  310. *
  311. */
  312. stateBeforeTagName(c) {
  313. if (isAsciiAlpha(c)) {
  314. if ((c | 0x20) === exports.STRINGS.META[0]) {
  315. this.sectionIndex = 1;
  316. this.state = State.TagNameMeta;
  317. }
  318. else {
  319. this.state = State.TagNameOther;
  320. }
  321. }
  322. else
  323. switch (c) {
  324. case Chars.SLASH: {
  325. this.state = State.BeforeCloseTagName;
  326. break;
  327. }
  328. case Chars.EXCLAMATION: {
  329. this.state = State.CommentStart;
  330. this.sectionIndex = 2;
  331. break;
  332. }
  333. case Chars.QUESTION: {
  334. this.state = State.WeirdTag;
  335. break;
  336. }
  337. default: {
  338. this.state = State.BeforeTag;
  339. this.stateBeforeTag(c);
  340. }
  341. }
  342. }
  343. stateBeforeCloseTagName(c) {
  344. this.state = isAsciiAlpha(c)
  345. ? // Switch to `TagNameOther`; the HTML spec allows attributes here as well.
  346. State.TagNameOther
  347. : State.WeirdTag;
  348. }
  349. stateCommentStart(c) {
  350. if (this.advanceSection(exports.STRINGS.COMMENT_START, c)) {
  351. if (this.sectionIndex === exports.STRINGS.COMMENT_START.length) {
  352. this.state = State.CommentEnd;
  353. // The -- of the comment start can be part of the end.
  354. this.sectionIndex = 2;
  355. }
  356. }
  357. else {
  358. this.state = State.WeirdTag;
  359. this.stateWeirdTag(c);
  360. }
  361. }
  362. stateCommentEnd(c) {
  363. if (this.advanceSection(exports.STRINGS.COMMENT_END, c)) {
  364. if (this.sectionIndex === exports.STRINGS.COMMENT_END.length) {
  365. this.state = State.BeforeTag;
  366. }
  367. }
  368. else if (c === Chars.DASH) {
  369. /*
  370. * If we are here, we know we expected a `>` above.
  371. * Set this to 2, to support many dashes before the closing `>`.
  372. */
  373. this.sectionIndex = 2;
  374. }
  375. }
  376. /**
  377. * Any section starting with `<!`, `<?`, `</`, without being a closing tag or comment.
  378. */
  379. stateWeirdTag(c) {
  380. if (c === Chars.GT) {
  381. this.state = State.BeforeTag;
  382. }
  383. }
  384. /**
  385. * Advances the section, ignoring upper/lower case.
  386. *
  387. * Make sure the section has left-over characters before calling.
  388. *
  389. * @returns `false` if we did not match the section.
  390. */
  391. advanceSectionIC(section, c) {
  392. return this.advanceSection(section, c | 0x20);
  393. }
  394. /**
  395. * Advances the section.
  396. *
  397. * Make sure the section has left-over characters before calling.
  398. *
  399. * @returns `false` if we did not match the section.
  400. */
  401. advanceSection(section, c) {
  402. if (section[this.sectionIndex] === c) {
  403. this.sectionIndex++;
  404. return true;
  405. }
  406. this.sectionIndex = 0;
  407. return false;
  408. }
  409. stateTagNameMeta(c) {
  410. if (this.sectionIndex < exports.STRINGS.META.length) {
  411. if (this.advanceSectionIC(exports.STRINGS.META, c)) {
  412. return;
  413. }
  414. }
  415. else if (SPACE_CHARACTERS.has(c)) {
  416. this.inMetaTag = true;
  417. this.gotPragma = null;
  418. this.needsPragma = null;
  419. this.state = State.BeforeAttribute;
  420. return;
  421. }
  422. this.state = State.TagNameOther;
  423. // Reconsume in case there is a `>`.
  424. this.stateTagNameOther(c);
  425. }
  426. stateTagNameOther(c) {
  427. if (SPACE_CHARACTERS.has(c)) {
  428. this.state = State.BeforeAttribute;
  429. }
  430. else if (c === Chars.GT) {
  431. this.state = State.BeforeTag;
  432. }
  433. }
  434. stateBeforeAttribute(c) {
  435. if (SPACE_CHARACTERS.has(c))
  436. return;
  437. if (this.inMetaTag) {
  438. const lower = c | 0x20;
  439. if (lower === exports.STRINGS.HTTP_EQUIV[0]) {
  440. this.sectionIndex = 1;
  441. this.state = State.MetaAttribHttpEquiv;
  442. return;
  443. }
  444. else if (lower === exports.STRINGS.CHARSET[0]) {
  445. this.sectionIndex = 1;
  446. this.state = State.MetaAttribC;
  447. return;
  448. }
  449. }
  450. this.state =
  451. c === Chars.SLASH || c === Chars.GT
  452. ? State.BeforeTag
  453. : State.AnyAttribName;
  454. }
  455. handleMetaAttrib(c, section, type) {
  456. if (this.advanceSectionIC(section, c)) {
  457. if (this.sectionIndex === section.length) {
  458. this.attribType = type;
  459. this.state = State.MetaAttribAfterName;
  460. }
  461. }
  462. else {
  463. this.state = State.AnyAttribName;
  464. this.stateAnyAttribName(c);
  465. }
  466. }
  467. stateMetaAttribHttpEquiv(c) {
  468. this.handleMetaAttrib(c, exports.STRINGS.HTTP_EQUIV, AttribType.HttpEquiv);
  469. }
  470. stateMetaAttribC(c) {
  471. const lower = c | 0x20;
  472. if (lower === exports.STRINGS.CHARSET[1]) {
  473. this.sectionIndex = 2;
  474. this.state = State.MetaAttribCharset;
  475. }
  476. else if (lower === exports.STRINGS.CONTENT[1]) {
  477. this.sectionIndex = 2;
  478. this.state = State.MetaAttribContent;
  479. }
  480. else {
  481. this.state = State.AnyAttribName;
  482. this.stateAnyAttribName(c);
  483. }
  484. }
  485. stateMetaAttribCharset(c) {
  486. this.handleMetaAttrib(c, exports.STRINGS.CHARSET, AttribType.Charset);
  487. }
  488. stateMetaAttribContent(c) {
  489. this.handleMetaAttrib(c, exports.STRINGS.CONTENT, AttribType.Content);
  490. }
  491. stateMetaAttribAfterName(c) {
  492. if (SPACE_CHARACTERS.has(c) || c === Chars.EQUALS) {
  493. this.state = State.AfterAttributeName;
  494. this.stateAfterAttributeName(c);
  495. }
  496. else {
  497. this.state = State.AnyAttribName;
  498. this.stateAnyAttribName(c);
  499. }
  500. }
  501. stateAnyAttribName(c) {
  502. if (SPACE_CHARACTERS.has(c)) {
  503. this.attribType = AttribType.None;
  504. this.state = State.AfterAttributeName;
  505. }
  506. else if (c === Chars.SLASH || c === Chars.GT) {
  507. this.state = State.BeforeTag;
  508. }
  509. else if (c === Chars.EQUALS) {
  510. this.state = State.BeforeAttributeValue;
  511. }
  512. }
  513. stateAfterAttributeName(c) {
  514. if (SPACE_CHARACTERS.has(c))
  515. return;
  516. if (c === Chars.EQUALS) {
  517. this.state = State.BeforeAttributeValue;
  518. }
  519. else {
  520. this.state = State.BeforeAttribute;
  521. this.stateBeforeAttribute(c);
  522. }
  523. }
  524. stateBeforeAttributeValue(c) {
  525. if (SPACE_CHARACTERS.has(c))
  526. return;
  527. this.attributeValue.length = 0;
  528. this.sectionIndex = 0;
  529. if (isQuote(c)) {
  530. this.quoteCharacter = c;
  531. this.state =
  532. this.attribType === AttribType.Content
  533. ? State.MetaContentValueQuotedBeforeEncoding
  534. : this.attribType === AttribType.HttpEquiv
  535. ? State.MetaAttribHttpEquivValue
  536. : State.AttributeValueQuoted;
  537. }
  538. else if (this.attribType === AttribType.Content) {
  539. this.state = State.MetaContentValueUnquotedBeforeEncoding;
  540. this.stateMetaContentValueUnquotedBeforeEncoding(c);
  541. }
  542. else if (this.attribType === AttribType.HttpEquiv) {
  543. // We use `quoteCharacter = 0` to signify that the value is unquoted.
  544. this.quoteCharacter = 0;
  545. this.sectionIndex = 0;
  546. this.state = State.MetaAttribHttpEquivValue;
  547. this.stateMetaAttribHttpEquivValue(c);
  548. }
  549. else {
  550. this.state = State.AttributeValueUnquoted;
  551. this.stateAttributeValueUnquoted(c);
  552. }
  553. }
  554. // The value has to be `content-type`
  555. stateMetaAttribHttpEquivValue(c) {
  556. if (this.sectionIndex === exports.STRINGS.CONTENT_TYPE.length) {
  557. if (this.quoteCharacter === 0
  558. ? END_OF_UNQUOTED_ATTRIBUTE_VALUE.has(c)
  559. : c === this.quoteCharacter) {
  560. if (this.needsPragma !== null) {
  561. this.setResult(this.needsPragma, ResultType.META_TAG);
  562. }
  563. else if (this.gotPragma === null) {
  564. this.gotPragma = true;
  565. }
  566. this.state = State.BeforeAttribute;
  567. return;
  568. }
  569. }
  570. else if (this.advanceSectionIC(exports.STRINGS.CONTENT_TYPE, c)) {
  571. return;
  572. }
  573. this.gotPragma = false;
  574. if (this.quoteCharacter === 0) {
  575. this.state = State.AttributeValueUnquoted;
  576. this.stateAttributeValueUnquoted(c);
  577. }
  578. else {
  579. this.state = State.AttributeValueQuoted;
  580. this.stateAttributeValueQuoted(c);
  581. }
  582. }
  583. handleMetaContentValue() {
  584. if (this.attributeValue.length === 0)
  585. return;
  586. const encoding = String.fromCharCode(...this.attributeValue);
  587. if (this.gotPragma) {
  588. this.setResult(encoding, ResultType.META_TAG);
  589. }
  590. else if (this.needsPragma === null) {
  591. // Don't override a previous result.
  592. this.needsPragma = encoding;
  593. }
  594. this.attributeValue.length = 0;
  595. }
  596. handleAttributeValue() {
  597. if (this.attribType === AttribType.Charset) {
  598. this.setResult(String.fromCharCode(...this.attributeValue), ResultType.META_TAG);
  599. }
  600. }
  601. stateAttributeValueUnquoted(c) {
  602. if (SPACE_CHARACTERS.has(c)) {
  603. this.handleAttributeValue();
  604. this.state = State.BeforeAttribute;
  605. }
  606. else if (c === Chars.SLASH || c === Chars.GT) {
  607. this.handleAttributeValue();
  608. this.state = State.BeforeTag;
  609. }
  610. else if (this.attribType === AttribType.Charset) {
  611. this.attributeValue.push(c | (c >= 0x41 && c <= 0x5a ? 0x20 : 0));
  612. }
  613. }
  614. findMetaContentEncoding(c) {
  615. if (this.advanceSectionIC(exports.STRINGS.CHARSET, c)) {
  616. if (this.sectionIndex === exports.STRINGS.CHARSET.length) {
  617. return true;
  618. }
  619. }
  620. else {
  621. // If we encountered another `c`, assume we started over.
  622. this.sectionIndex = Number(c === exports.STRINGS.CHARSET[0]);
  623. }
  624. return false;
  625. }
  626. stateMetaContentValueUnquotedBeforeEncoding(c) {
  627. if (END_OF_UNQUOTED_ATTRIBUTE_VALUE.has(c)) {
  628. this.stateAttributeValueUnquoted(c);
  629. }
  630. else if (this.sectionIndex === exports.STRINGS.CHARSET.length) {
  631. if (c === Chars.EQUALS) {
  632. this.state = State.MetaContentValueUnquotedBeforeValue;
  633. }
  634. }
  635. else {
  636. this.findMetaContentEncoding(c);
  637. }
  638. }
  639. stateMetaContentValueUnquotedBeforeValue(c) {
  640. if (isQuote(c)) {
  641. this.quoteCharacter = c;
  642. this.state = State.MetaContentValueUnquotedValueQuoted;
  643. }
  644. else if (END_OF_UNQUOTED_ATTRIBUTE_VALUE.has(c)) {
  645. // Can't have spaces here, as it would no longer be part of the attribute value.
  646. this.stateAttributeValueUnquoted(c);
  647. }
  648. else {
  649. this.state = State.MetaContentValueUnquotedValueUnquoted;
  650. this.stateMetaContentValueUnquotedValueUnquoted(c);
  651. }
  652. }
  653. stateMetaContentValueUnquotedValueQuoted(c) {
  654. if (END_OF_UNQUOTED_ATTRIBUTE_VALUE.has(c)) {
  655. // Quotes weren't matched, so we're done.
  656. this.stateAttributeValueUnquoted(c);
  657. }
  658. else if (c === this.quoteCharacter) {
  659. this.handleMetaContentValue();
  660. this.state = State.AttributeValueUnquoted;
  661. }
  662. else {
  663. this.attributeValue.push(c | (c >= 0x41 && c <= 0x5a ? 0x20 : 0));
  664. }
  665. }
  666. stateMetaContentValueUnquotedValueUnquoted(c) {
  667. if (END_OF_UNQUOTED_ATTRIBUTE_VALUE.has(c) || c === Chars.SEMICOLON) {
  668. this.handleMetaContentValue();
  669. this.state = State.AttributeValueUnquoted;
  670. this.stateAttributeValueUnquoted(c);
  671. }
  672. else {
  673. this.attributeValue.push(c | (c >= 0x41 && c <= 0x5a ? 0x20 : 0));
  674. }
  675. }
  676. stateMetaContentValueQuotedValueUnquoted(c) {
  677. if (isQuote(c) || SPACE_CHARACTERS.has(c) || c === Chars.SEMICOLON) {
  678. this.handleMetaContentValue();
  679. // We are done with the value, but might not be at the end of the attribute
  680. this.state = State.AttributeValueQuoted;
  681. this.stateAttributeValueQuoted(c);
  682. }
  683. else {
  684. this.attributeValue.push(c | (c >= 0x41 && c <= 0x5a ? 0x20 : 0));
  685. }
  686. }
  687. stateMetaContentValueQuotedValueQuoted(c) {
  688. if (isQuote(c)) {
  689. // We have reached the end of our value.
  690. if (c !== this.quoteCharacter) {
  691. // Only handle the value if inner quotes were matched.
  692. this.handleMetaContentValue();
  693. }
  694. this.state = State.AttributeValueQuoted;
  695. this.stateAttributeValueQuoted(c);
  696. }
  697. else {
  698. this.attributeValue.push(c | (c >= 0x41 && c <= 0x5a ? 0x20 : 0));
  699. }
  700. }
  701. stateMetaContentValueQuotedBeforeEncoding(c) {
  702. if (c === this.quoteCharacter) {
  703. this.stateAttributeValueQuoted(c);
  704. }
  705. else if (this.findMetaContentEncoding(c)) {
  706. this.state = State.MetaContentValueQuotedAfterEncoding;
  707. }
  708. }
  709. stateMetaContentValueQuotedAfterEncoding(c) {
  710. if (c === Chars.EQUALS) {
  711. this.state = State.MetaContentValueQuotedBeforeValue;
  712. }
  713. else if (!SPACE_CHARACTERS.has(c)) {
  714. // Look for the next encoding
  715. this.state = State.MetaContentValueQuotedBeforeEncoding;
  716. this.stateMetaContentValueQuotedBeforeEncoding(c);
  717. }
  718. }
  719. stateMetaContentValueQuotedBeforeValue(c) {
  720. if (c === this.quoteCharacter) {
  721. this.stateAttributeValueQuoted(c);
  722. }
  723. else if (isQuote(c)) {
  724. this.state = State.MetaContentValueQuotedValueQuoted;
  725. }
  726. else if (!SPACE_CHARACTERS.has(c)) {
  727. this.state = State.MetaContentValueQuotedValueUnquoted;
  728. this.stateMetaContentValueQuotedValueUnquoted(c);
  729. }
  730. }
  731. stateAttributeValueQuoted(c) {
  732. if (c === this.quoteCharacter) {
  733. this.handleAttributeValue();
  734. this.state = State.BeforeAttribute;
  735. }
  736. else if (this.attribType === AttribType.Charset) {
  737. this.attributeValue.push(c | (c >= 0x41 && c <= 0x5a ? 0x20 : 0));
  738. }
  739. }
  740. // Read STRINGS.XML_DECLARATION
  741. stateXMLDeclaration(c) {
  742. if (this.advanceSection(exports.STRINGS.XML_DECLARATION, c)) {
  743. if (this.sectionIndex === exports.STRINGS.XML_DECLARATION.length) {
  744. this.sectionIndex = 0;
  745. this.state = State.XMLDeclarationBeforeEncoding;
  746. }
  747. }
  748. else {
  749. this.state = State.WeirdTag;
  750. }
  751. }
  752. stateXMLDeclarationBeforeEncoding(c) {
  753. if (this.advanceSection(exports.STRINGS.ENCODING, c)) {
  754. if (this.sectionIndex === exports.STRINGS.ENCODING.length) {
  755. this.state = State.XMLDeclarationAfterEncoding;
  756. }
  757. }
  758. else if (c === Chars.GT) {
  759. this.state = State.BeforeTag;
  760. }
  761. else {
  762. // If we encountered another `c`, assume we started over.
  763. this.sectionIndex = Number(c === exports.STRINGS.ENCODING[0]);
  764. }
  765. }
  766. stateXMLDeclarationAfterEncoding(c) {
  767. if (c === Chars.EQUALS) {
  768. this.state = State.XMLDeclarationBeforeValue;
  769. }
  770. else if (c > Chars.SPACE) {
  771. this.state = State.WeirdTag;
  772. this.stateWeirdTag(c);
  773. }
  774. }
  775. stateXMLDeclarationBeforeValue(c) {
  776. if (isQuote(c)) {
  777. this.attributeValue.length = 0;
  778. this.state = State.XMLDeclarationValue;
  779. }
  780. else if (c > Chars.SPACE) {
  781. this.state = State.WeirdTag;
  782. this.stateWeirdTag(c);
  783. }
  784. }
  785. stateXMLDeclarationValue(c) {
  786. if (isQuote(c)) {
  787. this.setResult(String.fromCharCode(...this.attributeValue), ResultType.XML_ENCODING);
  788. this.state = State.WeirdTag;
  789. }
  790. else if (c === Chars.GT) {
  791. this.state = State.BeforeTag;
  792. }
  793. else if (c <= Chars.SPACE) {
  794. this.state = State.WeirdTag;
  795. }
  796. else {
  797. this.attributeValue.push(c | (c >= 0x41 && c <= 0x5a ? 0x20 : 0));
  798. }
  799. }
  800. write(buffer) {
  801. let index = 0;
  802. for (; index < buffer.length && this.offset + index < this.maxBytes; index++) {
  803. const c = buffer[index];
  804. switch (this.state) {
  805. case State.Begin: {
  806. this.stateBegin(c);
  807. break;
  808. }
  809. case State.BOM16BE: {
  810. this.stateBOM16BE(c);
  811. break;
  812. }
  813. case State.BOM16LE: {
  814. this.stateBOM16LE(c);
  815. break;
  816. }
  817. case State.BOM8: {
  818. this.stateBOM8(c);
  819. break;
  820. }
  821. case State.UTF16LE_XML_PREFIX: {
  822. this.stateUTF16LE_XML_PREFIX(c);
  823. break;
  824. }
  825. case State.BeginLT: {
  826. this.stateBeginLT(c);
  827. break;
  828. }
  829. case State.UTF16BE_XML_PREFIX: {
  830. this.stateUTF16BE_XML_PREFIX(c);
  831. break;
  832. }
  833. case State.BeforeTag: {
  834. // Optimization: Skip all characters until we find a `<`
  835. const idx = buffer.indexOf(Chars.LT, index);
  836. if (idx === -1) {
  837. // We are done with this buffer. Stay in the state and try on the next one.
  838. index = buffer.length;
  839. }
  840. else {
  841. index = idx;
  842. this.stateBeforeTag(Chars.LT);
  843. }
  844. break;
  845. }
  846. case State.BeforeTagName: {
  847. this.stateBeforeTagName(c);
  848. break;
  849. }
  850. case State.BeforeCloseTagName: {
  851. this.stateBeforeCloseTagName(c);
  852. break;
  853. }
  854. case State.CommentStart: {
  855. this.stateCommentStart(c);
  856. break;
  857. }
  858. case State.CommentEnd: {
  859. this.stateCommentEnd(c);
  860. break;
  861. }
  862. case State.TagNameMeta: {
  863. this.stateTagNameMeta(c);
  864. break;
  865. }
  866. case State.TagNameOther: {
  867. this.stateTagNameOther(c);
  868. break;
  869. }
  870. case State.XMLDeclaration: {
  871. this.stateXMLDeclaration(c);
  872. break;
  873. }
  874. case State.XMLDeclarationBeforeEncoding: {
  875. this.stateXMLDeclarationBeforeEncoding(c);
  876. break;
  877. }
  878. case State.XMLDeclarationAfterEncoding: {
  879. this.stateXMLDeclarationAfterEncoding(c);
  880. break;
  881. }
  882. case State.XMLDeclarationBeforeValue: {
  883. this.stateXMLDeclarationBeforeValue(c);
  884. break;
  885. }
  886. case State.XMLDeclarationValue: {
  887. this.stateXMLDeclarationValue(c);
  888. break;
  889. }
  890. case State.WeirdTag: {
  891. this.stateWeirdTag(c);
  892. break;
  893. }
  894. case State.BeforeAttribute: {
  895. this.stateBeforeAttribute(c);
  896. break;
  897. }
  898. case State.MetaAttribHttpEquiv: {
  899. this.stateMetaAttribHttpEquiv(c);
  900. break;
  901. }
  902. case State.MetaAttribHttpEquivValue: {
  903. this.stateMetaAttribHttpEquivValue(c);
  904. break;
  905. }
  906. case State.MetaAttribC: {
  907. this.stateMetaAttribC(c);
  908. break;
  909. }
  910. case State.MetaAttribContent: {
  911. this.stateMetaAttribContent(c);
  912. break;
  913. }
  914. case State.MetaAttribCharset: {
  915. this.stateMetaAttribCharset(c);
  916. break;
  917. }
  918. case State.MetaAttribAfterName: {
  919. this.stateMetaAttribAfterName(c);
  920. break;
  921. }
  922. case State.MetaContentValueQuotedBeforeEncoding: {
  923. this.stateMetaContentValueQuotedBeforeEncoding(c);
  924. break;
  925. }
  926. case State.MetaContentValueQuotedAfterEncoding: {
  927. this.stateMetaContentValueQuotedAfterEncoding(c);
  928. break;
  929. }
  930. case State.MetaContentValueQuotedBeforeValue: {
  931. this.stateMetaContentValueQuotedBeforeValue(c);
  932. break;
  933. }
  934. case State.MetaContentValueQuotedValueQuoted: {
  935. this.stateMetaContentValueQuotedValueQuoted(c);
  936. break;
  937. }
  938. case State.MetaContentValueQuotedValueUnquoted: {
  939. this.stateMetaContentValueQuotedValueUnquoted(c);
  940. break;
  941. }
  942. case State.MetaContentValueUnquotedBeforeEncoding: {
  943. this.stateMetaContentValueUnquotedBeforeEncoding(c);
  944. break;
  945. }
  946. case State.MetaContentValueUnquotedBeforeValue: {
  947. this.stateMetaContentValueUnquotedBeforeValue(c);
  948. break;
  949. }
  950. case State.MetaContentValueUnquotedValueQuoted: {
  951. this.stateMetaContentValueUnquotedValueQuoted(c);
  952. break;
  953. }
  954. case State.MetaContentValueUnquotedValueUnquoted: {
  955. this.stateMetaContentValueUnquotedValueUnquoted(c);
  956. break;
  957. }
  958. case State.AnyAttribName: {
  959. this.stateAnyAttribName(c);
  960. break;
  961. }
  962. case State.AfterAttributeName: {
  963. this.stateAfterAttributeName(c);
  964. break;
  965. }
  966. case State.BeforeAttributeValue: {
  967. this.stateBeforeAttributeValue(c);
  968. break;
  969. }
  970. case State.AttributeValueQuoted: {
  971. this.stateAttributeValueQuoted(c);
  972. break;
  973. }
  974. case State.AttributeValueUnquoted: {
  975. this.stateAttributeValueUnquoted(c);
  976. break;
  977. }
  978. }
  979. }
  980. this.offset += index;
  981. }
  982. }
  983. exports.Sniffer = Sniffer;
  /**
   * Get the encoding for the passed buffer.
   *
   * Creates a one-off `Sniffer`, writes the whole buffer to it in a single
   * call and returns whatever the sniffer reports as its `encoding`.
   *
   * @param buffer Bytes to inspect.
   * @param options Passed through unchanged to the `Sniffer` constructor.
   * @returns The `encoding` property of the sniffer after the write.
   */
  function getEncoding(buffer, options) {
    const detector = new Sniffer(options);
    detector.write(buffer);
    const { encoding } = detector;
    return encoding;
  }
  990. //# sourceMappingURL=sniffer.js.map