index.js 9.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244
  1. "use strict";
  2. /**
  3. * @file Batteries-included version of Cheerio. This module includes several
  4. * convenience methods for loading documents from various sources.
  5. */
  /**
   * Interop helper emitted by the TypeScript compiler: exposes property `k` of
   * module `m` on object `o` under the name `k2` (defaults to `k`).
   *
   * Where `Object.create` exists, the property is installed with
   * `Object.defineProperty`. The source descriptor is reused unless it is
   * missing, is an accessor on a non-ES module, or is a writable/configurable
   * data property; in those cases an enumerable getter that reads `m[k]` on
   * every access is installed instead. Without `Object.create`, the value is
   * simply copied.
   */
  var __createBinding = (this && this.__createBinding) || (function () {
      if (!Object.create) {
          // Legacy path: a plain one-time copy of the value.
          return function (target, source, key, alias) {
              if (alias === undefined) alias = key;
              target[alias] = source[key];
          };
      }
      return function (target, source, key, alias) {
          if (alias === undefined) alias = key;
          var descriptor = Object.getOwnPropertyDescriptor(source, key);
          // Decide whether the source descriptor must be replaced by a getter.
          var useLiveGetter = true;
          if (descriptor) {
              useLiveGetter = "get" in descriptor
                  ? !source.__esModule
                  : descriptor.writable || descriptor.configurable;
          }
          if (useLiveGetter) {
              descriptor = {
                  enumerable: true,
                  get: function () { return source[key]; },
              };
          }
          Object.defineProperty(target, alias, descriptor);
      };
  })();
  /**
   * Interop helper emitted by the TypeScript compiler: stores `v` as the
   * `default` member of namespace object `o`.
   *
   * Uses `Object.defineProperty` (enumerable, otherwise default attributes)
   * when `Object.create` is available, and plain assignment otherwise.
   */
  var __setModuleDefault = (this && this.__setModuleDefault) || (function () {
      var defineDefault = function (namespace, value) {
          Object.defineProperty(namespace, "default", { enumerable: true, value: value });
      };
      var assignDefault = function (namespace, value) {
          namespace["default"] = value;
      };
      return Object.create ? defineDefault : assignDefault;
  })();
  /**
   * Interop helper emitted by the TypeScript compiler for `export * from`.
   *
   * Binds every enumerable property of `m` onto `exports` via
   * `__createBinding`, skipping `default` and any name `exports` already owns.
   */
  var __exportStar = (this && this.__exportStar) || function (m, exports) {
      var hasOwn = Object.prototype.hasOwnProperty;
      for (var name in m) {
          if (name === "default") continue;
          // Names already defined on `exports` take precedence.
          if (hasOwn.call(exports, name)) continue;
          __createBinding(exports, m, name);
      }
  };
  /**
   * Interop helper emitted by the TypeScript compiler for `import * as ns`.
   *
   * Modules flagged with `__esModule` are returned untouched. Anything else is
   * wrapped in a fresh namespace object: every own key except `default` is
   * bound with `__createBinding`, and the module itself becomes `default`.
   */
  var __importStar = (this && this.__importStar) || (function () {
      // Replaces itself on first call with `Object.getOwnPropertyNames`, or
      // with a `for...in` + `hasOwnProperty` fallback when that is missing.
      var listOwnKeys = function (target) {
          listOwnKeys = Object.getOwnPropertyNames || function (obj) {
              var names = [];
              for (var prop in obj) {
                  if (Object.prototype.hasOwnProperty.call(obj, prop)) {
                      names[names.length] = prop;
                  }
              }
              return names;
          };
          return listOwnKeys(target);
      };
      return function (mod) {
          if (mod && mod.__esModule) {
              return mod;
          }
          var namespace = {};
          if (mod != null) {
              var keys = listOwnKeys(mod);
              for (var index = 0; index < keys.length; index++) {
                  if (keys[index] !== "default") {
                      __createBinding(namespace, mod, keys[index]);
                  }
              }
          }
          __setModuleDefault(namespace, mod);
          return namespace;
      };
  })();
  /**
   * Interop helper emitted by the TypeScript compiler for default imports.
   *
   * Returns `mod` itself when it is flagged with `__esModule`; otherwise wraps
   * it as `{ default: mod }`.
   */
  var __importDefault = (this && this.__importDefault) || function (mod) {
      if (mod && mod.__esModule) {
          return mod;
      }
      return { "default": mod };
  };
  45. Object.defineProperty(exports, "__esModule", { value: true });
  46. exports.merge = exports.contains = void 0;
  47. exports.loadBuffer = loadBuffer;
  48. exports.stringStream = stringStream;
  49. exports.decodeStream = decodeStream;
  50. exports.fromURL = fromURL;
  51. __exportStar(require("./load-parse.js"), exports);
  52. var static_js_1 = require("./static.js");
  53. Object.defineProperty(exports, "contains", { enumerable: true, get: function () { return static_js_1.contains; } });
  54. Object.defineProperty(exports, "merge", { enumerable: true, get: function () { return static_js_1.merge; } });
  55. const parse5_htmlparser2_tree_adapter_1 = require("parse5-htmlparser2-tree-adapter");
  56. const htmlparser2 = __importStar(require("htmlparser2"));
  57. const parse5_parser_stream_1 = require("parse5-parser-stream");
  58. const encoding_sniffer_1 = require("encoding-sniffer");
  59. const undici = __importStar(require("undici"));
  60. const whatwg_mimetype_1 = __importDefault(require("whatwg-mimetype"));
  61. const node_stream_1 = require("node:stream");
  62. const options_js_1 = require("./options.js");
  63. const load_parse_js_1 = require("./load-parse.js");
  /**
   * Decodes a buffer to a string by sniffing its encoding, then loads the
   * result and returns the querying function bound to that document.
   *
   * When sniffing is inconclusive the fallback encoding is `utf8` if the
   * flattened options have `xmlMode` set and `windows-1252` otherwise. Any
   * entries in `options.encoding` are spread after that fallback, so they take
   * precedence over it.
   *
   * @category Loading
   * @example
   *
   * ```js
   * import * as cheerio from 'cheerio';
   *
   * const buffer = fs.readFileSync('index.html');
   * const $ = cheerio.loadBuffer(buffer);
   * ```
   *
   * @param buffer - The buffer whose encoding is sniffed and decoded.
   * @param options - Cheerio options; `options.encoding` is forwarded to the
   *   decoder.
   * @returns The loaded document.
   */
  function loadBuffer(buffer, options = {}) {
      const flatOptions = (0, options_js_1.flattenOptions)(options);
      const fallbackEncoding = flatOptions != null && flatOptions.xmlMode
          ? 'utf8'
          : 'windows-1252';
      const decoderOptions = {
          defaultEncoding: fallbackEncoding,
          ...options.encoding,
      };
      const markup = (0, encoding_sniffer_1.decodeBuffer)(buffer, decoderOptions);
      return (0, load_parse_js_1.load)(markup, flatOptions);
  }
  /**
   * Builds the writable stream behind `stringStream` and `decodeStream`.
   *
   * - With `options._useHtmlParser2` set, chunks are fed to an htmlparser2
   *   document stream wrapped in a `Writable`; `cb` is invoked from the
   *   parser's completion callback.
   * - Otherwise a parse5 `ParserStream` is returned and `cb` is invoked once
   *   `finished` reports that the stream has ended or failed.
   *
   * In both cases `cb` receives `(err, $)` where `$` is `load(document)`.
   *
   * The `options` object passed in is never modified: the parse5 defaults
   * (`treeAdapter`, `scriptingEnabled`) are applied to a shallow copy, so an
   * options object shared with other callers is left untouched.
   *
   * @param options - Flattened Cheerio options (may be `null`/`undefined`).
   * @param cb - Called with the error (if any) and the loaded document.
   * @returns A writable stream that accepts string chunks.
   */
  function _stringStream(options, cb) {
      if (options != null && options._useHtmlParser2) {
          const parser = htmlparser2.createDocumentStream((err, document) => cb(err, (0, load_parse_js_1.load)(document)), options);
          return new node_stream_1.Writable({
              // Keep chunks as strings instead of converting them to Buffers.
              decodeStrings: false,
              write(chunk, _encoding, callback) {
                  if (typeof chunk !== 'string') {
                      throw new TypeError('Expected a string');
                  }
                  parser.write(chunk);
                  callback();
              },
              final(callback) {
                  parser.end();
                  callback();
              },
          });
      }
      // Apply parse5 defaults on a copy rather than on the caller's object.
      // Spreading `null`/`undefined` yields an empty object.
      const parserOptions = { ...options };
      if (parserOptions.treeAdapter == null) {
          parserOptions.treeAdapter = parse5_htmlparser2_tree_adapter_1.adapter;
      }
      // Scripting stays enabled unless it was explicitly set to `false`.
      if (parserOptions.scriptingEnabled !== false) {
          parserOptions.scriptingEnabled = true;
      }
      const stream = new parse5_parser_stream_1.ParserStream(parserOptions);
      (0, node_stream_1.finished)(stream, (err) => cb(err, (0, load_parse_js_1.load)(stream.document)));
      return stream;
  }
  /**
   * Creates a `Writable` stream that parses the strings written to it into a
   * document.
   *
   * Once the stream has finished, `cb` is called with the loaded document (or
   * with an error as its first argument).
   *
   * @category Loading
   * @example
   *
   * ```js
   * import * as cheerio from 'cheerio';
   * import * as fs from 'fs';
   *
   * const writeStream = cheerio.stringStream({}, (err, $) => {
   *   if (err) {
   *     // Handle error
   *   }
   *
   *   console.log($('h1').text());
   *   // Output: Hello, world!
   * });
   *
   * fs.createReadStream('my-document.html', { encoding: 'utf8' }).pipe(
   *   writeStream,
   * );
   * ```
   *
   * @param options - The options to pass to Cheerio; they are flattened before
   *   the stream is created.
   * @param cb - The callback to call when the stream is finished.
   * @returns The writable stream.
   */
  function stringStream(options, cb) {
      const flatOptions = (0, options_js_1.flattenOptions)(options);
      return _stringStream(flatOptions, cb);
  }
  /**
   * Parses a stream of buffers into a document.
   *
   * Returns a `DecodeStream` that accepts buffers, sniffs their encoding and
   * pipes the decoded strings into the parsing stream. When the parsing stream
   * is finished, `cb` is called with the loaded document.
   *
   * If `options.encoding.defaultEncoding` is not set, it defaults to `utf8`
   * when the flattened options have `xmlMode` set and to `windows-1252`
   * otherwise. The default is applied to a copy of `options.encoding`, so an
   * encoding object reused across calls does not carry one call's default
   * into the next.
   *
   * @category Loading
   * @param options - The options to pass to Cheerio, plus an optional
   *   `encoding` object forwarded to the decoder.
   * @param cb - The callback to call when the stream is finished.
   * @returns The writable stream.
   */
  function decodeStream(options, cb) {
      const { encoding = {}, ...cheerioOptions } = options;
      const opts = (0, options_js_1.flattenOptions)(cheerioOptions);
      // Copy before defaulting so the caller's `encoding` object stays as it
      // was (spreading also tolerates an explicit `null`).
      const sniffOptions = { ...encoding };
      if (sniffOptions.defaultEncoding == null) {
          // Set the default encoding to UTF-8 for XML mode
          sniffOptions.defaultEncoding = opts != null && opts.xmlMode ? 'utf8' : 'windows-1252';
      }
      const decoder = new encoding_sniffer_1.DecodeStream(sniffOptions);
      const loadStream = _stringStream(opts, cb);
      decoder.pipe(loadStream);
      return decoder;
  }
  /** Request options used by `fromURL` when the caller supplies none. */
  const defaultRequestOptions = {
      method: 'GET',
      // Allow redirects by default
      maxRedirections: 5,
      // Set an Accept header
      headers: {
          accept: 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
      },
  };
  /**
   * `fromURL` loads a document from a URL.
   *
   * By default, redirects are allowed and non-2xx responses are rejected.
   * Responses whose content type is neither HTML nor XML are rejected with a
   * `RangeError`; a missing `content-type` header is treated as `text/html`.
   *
   * The charset from the `content-type` header is forwarded to the decoder,
   * `xmlMode` is derived from the MIME type, and `baseURL` is set to the final
   * URL after redirects. Explicit Cheerio options in `options` override these
   * derived values.
   *
   * The caller's `options.requestOptions` and `options.encoding` objects are
   * not modified; defaults and per-response values are applied to copies.
   *
   * @category Loading
   * @example
   *
   * ```js
   * import * as cheerio from 'cheerio';
   *
   * const $ = await cheerio.fromURL('https://example.com');
   * ```
   *
   * @param url - The URL to load the document from.
   * @param options - The options to pass to Cheerio, plus optional
   *   `requestOptions` (passed to undici) and `encoding` (passed to the
   *   decoder).
   * @returns The loaded document.
   */
  async function fromURL(url, options = {}) {
      const { requestOptions = defaultRequestOptions, encoding = {}, ...cheerioOptions } = options;
      // Add headers if none were supplied, on a copy so the caller's request
      // options are left as they were.
      const request = requestOptions.headers != null
          ? requestOptions
          : { ...requestOptions, headers: defaultRequestOptions.headers };
      let undiciStream;
      const promise = new Promise((resolve, reject) => {
          undiciStream = undici.stream(url, request, (res) => {
              if (res.statusCode < 200 || res.statusCode >= 300) {
                  throw new undici.errors.ResponseError('Response Error', res.statusCode, {
                      headers: res.headers,
                  });
              }
              const rawContentType = res.headers['content-type'];
              const contentTypeHeader = rawContentType != null ? rawContentType : 'text/html';
              const mimeType = new whatwg_mimetype_1.default(Array.isArray(contentTypeHeader)
                  ? contentTypeHeader[0]
                  : contentTypeHeader);
              if (!mimeType.isHTML() && !mimeType.isXML()) {
                  throw new RangeError(`The content-type "${mimeType.essence}" is neither HTML nor XML.`);
              }
              /*
               * If we allow redirects, we will have entries in the history.
               * The last entry will be the final URL. Fall back to the
               * requested URL when there is no (or an empty) history.
               */
              const history = res.context != null ? res.context.history : undefined;
              const finalURL = history && history.length > 0 ? history[history.length - 1] : url;
              const opts = {
                  // Forward the charset from the header to the decodeStream,
                  // using a fresh object so a reused `encoding` is not changed.
                  encoding: {
                      ...encoding,
                      transportLayerEncodingLabel: mimeType.parameters.get('charset'),
                  },
                  // Set XML mode based on the MIME type.
                  xmlMode: mimeType.isXML(),
                  // Set the `baseURL` to the final URL.
                  baseURL: finalURL,
                  ...cheerioOptions,
              };
              return decodeStream(opts, (err, $) => (err ? reject(err) : resolve($)));
          });
      });
      // `promise` has no handler until it is returned below. Mark it as
      // handled so a parser failure that happens while the request is still in
      // flight cannot raise an `unhandledRejection`; the rejection itself is
      // still delivered to the caller through `return promise`.
      promise.catch(() => { });
      // Let's make sure the request is completed before returning the promise.
      await undiciStream;
      return promise;
  }
  244. //# sourceMappingURL=index.js.map