deterministicGrouping.js 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561
  1. /*
  2. MIT License http://www.opensource.org/licenses/mit-license.php
  3. Author Tobias Koppers @sokra
  4. */
  5. "use strict";
  6. // Simulations show these probabilities for a single change
  7. // 93.1% that one group is invalidated
  8. // 4.8% that two groups are invalidated
  9. // 1.1% that 3 groups are invalidated
  10. // 0.1% that 4 or more groups are invalidated
  11. //
  12. // And these for removing/adding 10 lexically adjacent files
  13. // 64.5% that one group is invalidated
  14. // 24.8% that two groups are invalidated
  15. // 7.8% that 3 groups are invalidated
  16. // 2.7% that 4 or more groups are invalidated
  17. //
  18. // And these for removing/adding 3 random files
  19. // 0% that one group is invalidated
  20. // 3.7% that two groups are invalidated
  21. // 80.8% that 3 groups are invalidated
  22. // 12.3% that 4 groups are invalidated
  23. // 3.2% that 5 or more groups are invalidated
  24. /**
  25. * @param {string} a key
  26. * @param {string} b key
  27. * @returns {number} the similarity as number
  28. */
  29. const similarity = (a, b) => {
  30. const l = Math.min(a.length, b.length);
  31. let dist = 0;
  32. for (let i = 0; i < l; i++) {
  33. const ca = a.charCodeAt(i);
  34. const cb = b.charCodeAt(i);
  35. dist += Math.max(0, 10 - Math.abs(ca - cb));
  36. }
  37. return dist;
  38. };
  39. /**
  40. * @param {string} a key
  41. * @param {string} b key
  42. * @param {Set<string>} usedNames set of already used names
  43. * @returns {string} the common part and a single char for the difference
  44. */
  45. const getName = (a, b, usedNames) => {
  46. const l = Math.min(a.length, b.length);
  47. let i = 0;
  48. while (i < l) {
  49. if (a.charCodeAt(i) !== b.charCodeAt(i)) {
  50. i++;
  51. break;
  52. }
  53. i++;
  54. }
  55. while (i < l) {
  56. const name = a.slice(0, i);
  57. const lowerName = name.toLowerCase();
  58. if (!usedNames.has(lowerName)) {
  59. usedNames.add(lowerName);
  60. return name;
  61. }
  62. i++;
  63. }
  64. // names always contain a hash, so this is always unique
  65. // we don't need to check usedNames nor add it
  66. return a;
  67. };
  68. /** @typedef {Record<string, number>} Sizes */
  69. /**
  70. * @param {Sizes} total total size
  71. * @param {Sizes} size single size
  72. * @returns {void}
  73. */
  74. const addSizeTo = (total, size) => {
  75. for (const key of Object.keys(size)) {
  76. total[key] = (total[key] || 0) + size[key];
  77. }
  78. };
  79. /**
  80. * @param {Sizes} total total size
  81. * @param {Sizes} size single size
  82. * @returns {void}
  83. */
  84. const subtractSizeFrom = (total, size) => {
  85. for (const key of Object.keys(size)) {
  86. total[key] -= size[key];
  87. }
  88. };
  89. /**
  90. * @template T
  91. * @param {Iterable<Node<T>>} nodes some nodes
  92. * @returns {Sizes} total size
  93. */
  94. const sumSize = (nodes) => {
  95. /** @type {Sizes} */
  96. const sum = Object.create(null);
  97. for (const node of nodes) {
  98. addSizeTo(sum, node.size);
  99. }
  100. return sum;
  101. };
  102. /**
  103. * @param {Sizes} size size
  104. * @param {Sizes} maxSize minimum size
  105. * @returns {boolean} true, when size is too big
  106. */
  107. const isTooBig = (size, maxSize) => {
  108. for (const key of Object.keys(size)) {
  109. const s = size[key];
  110. if (s === 0) continue;
  111. const maxSizeValue = maxSize[key];
  112. if (typeof maxSizeValue === "number" && s > maxSizeValue) return true;
  113. }
  114. return false;
  115. };
  116. /**
  117. * @param {Sizes} size size
  118. * @param {Sizes} minSize minimum size
  119. * @returns {boolean} true, when size is too small
  120. */
  121. const isTooSmall = (size, minSize) => {
  122. for (const key of Object.keys(size)) {
  123. const s = size[key];
  124. if (s === 0) continue;
  125. const minSizeValue = minSize[key];
  126. if (typeof minSizeValue === "number" && s < minSizeValue) return true;
  127. }
  128. return false;
  129. };
  130. /** @typedef {Set<string>} Types */
  131. /**
  132. * @param {Sizes} size size
  133. * @param {Sizes} minSize minimum size
  134. * @returns {Types} set of types that are too small
  135. */
  136. const getTooSmallTypes = (size, minSize) => {
  137. /** @type {Types} */
  138. const types = new Set();
  139. for (const key of Object.keys(size)) {
  140. const s = size[key];
  141. if (s === 0) continue;
  142. const minSizeValue = minSize[key];
  143. if (typeof minSizeValue === "number" && s < minSizeValue) types.add(key);
  144. }
  145. return types;
  146. };
  147. /**
  148. * @template {object} T
  149. * @param {T} size size
  150. * @param {Types} types types
  151. * @returns {number} number of matching size types
  152. */
  153. const getNumberOfMatchingSizeTypes = (size, types) => {
  154. let i = 0;
  155. for (const key of Object.keys(size)) {
  156. if (size[/** @type {keyof T} */ (key)] !== 0 && types.has(key)) i++;
  157. }
  158. return i;
  159. };
  160. /**
  161. * @param {Sizes} size size
  162. * @param {Types} types types
  163. * @returns {number} selective size sum
  164. */
  165. const selectiveSizeSum = (size, types) => {
  166. let sum = 0;
  167. for (const key of Object.keys(size)) {
  168. if (size[key] !== 0 && types.has(key)) sum += size[key];
  169. }
  170. return sum;
  171. };
  172. /**
  173. * @template T
  174. */
  175. class Node {
  176. /**
  177. * @param {T} item item
  178. * @param {string} key key
  179. * @param {Sizes} size size
  180. */
  181. constructor(item, key, size) {
  182. this.item = item;
  183. this.key = key;
  184. this.size = size;
  185. }
  186. }
  187. /** @typedef {number[]} Similarities */
  188. /**
  189. * @template T
  190. */
  191. class Group {
  192. /**
  193. * @param {Node<T>[]} nodes nodes
  194. * @param {Similarities | null} similarities similarities between the nodes (length = nodes.length - 1)
  195. * @param {Sizes=} size size of the group
  196. */
  197. constructor(nodes, similarities, size) {
  198. this.nodes = nodes;
  199. this.similarities = similarities;
  200. this.size = size || sumSize(nodes);
  201. /** @type {string | undefined} */
  202. this.key = undefined;
  203. }
  204. /**
  205. * @param {(node: Node<T>) => boolean} filter filter function
  206. * @returns {Node<T>[] | undefined} removed nodes
  207. */
  208. popNodes(filter) {
  209. /** @type {Node<T>[]} */
  210. const newNodes = [];
  211. /** @type {Similarities} */
  212. const newSimilarities = [];
  213. /** @type {Node<T>[]} */
  214. const resultNodes = [];
  215. /** @type {undefined | Node<T>} */
  216. let lastNode;
  217. for (let i = 0; i < this.nodes.length; i++) {
  218. const node = this.nodes[i];
  219. if (filter(node)) {
  220. resultNodes.push(node);
  221. } else {
  222. if (newNodes.length > 0) {
  223. newSimilarities.push(
  224. lastNode === this.nodes[i - 1]
  225. ? /** @type {Similarities} */ (this.similarities)[i - 1]
  226. : similarity(/** @type {Node<T>} */ (lastNode).key, node.key)
  227. );
  228. }
  229. newNodes.push(node);
  230. lastNode = node;
  231. }
  232. }
  233. if (resultNodes.length === this.nodes.length) return;
  234. this.nodes = newNodes;
  235. this.similarities = newSimilarities;
  236. this.size = sumSize(newNodes);
  237. return resultNodes;
  238. }
  239. }
  240. /**
  241. * @template T
  242. * @param {Iterable<Node<T>>} nodes nodes
  243. * @returns {Similarities} similarities
  244. */
  245. const getSimilarities = (nodes) => {
  246. // calculate similarities between lexically adjacent nodes
  247. /** @type {Similarities} */
  248. const similarities = [];
  249. /** @type {undefined | Node<T>} */
  250. let last;
  251. for (const node of nodes) {
  252. if (last !== undefined) {
  253. similarities.push(similarity(last.key, node.key));
  254. }
  255. last = node;
  256. }
  257. return similarities;
  258. };
  259. /**
  260. * @template T
  261. * @typedef {object} GroupedItems<T>
  262. * @property {string} key
  263. * @property {T[]} items
  264. * @property {Sizes} size
  265. */
  266. /**
  267. * @template T
  268. * @typedef {object} Options
  269. * @property {Sizes} maxSize maximum size of a group
  270. * @property {Sizes} minSize minimum size of a group (preferred over maximum size)
  271. * @property {Iterable<T>} items a list of items
  272. * @property {(item: T) => Sizes} getSize function to get size of an item
  273. * @property {(item: T) => string} getKey function to get the key of an item
  274. */
  275. /**
  276. * @template T
  277. * @param {Options<T>} options options object
  278. * @returns {GroupedItems<T>[]} grouped items
  279. */
  280. module.exports = ({ maxSize, minSize, items, getSize, getKey }) => {
  281. /** @type {Group<T>[]} */
  282. const result = [];
  283. const nodes = Array.from(
  284. items,
  285. (item) => new Node(item, getKey(item), getSize(item))
  286. );
  287. /** @type {Node<T>[]} */
  288. const initialNodes = [];
  289. // lexically ordering of keys
  290. nodes.sort((a, b) => {
  291. if (a.key < b.key) return -1;
  292. if (a.key > b.key) return 1;
  293. return 0;
  294. });
  295. // return nodes bigger than maxSize directly as group
  296. // But make sure that minSize is not violated
  297. for (const node of nodes) {
  298. if (isTooBig(node.size, maxSize) && !isTooSmall(node.size, minSize)) {
  299. result.push(new Group([node], []));
  300. } else {
  301. initialNodes.push(node);
  302. }
  303. }
  304. if (initialNodes.length > 0) {
  305. const initialGroup = new Group(initialNodes, getSimilarities(initialNodes));
  306. /**
  307. * @param {Group<T>} group group
  308. * @param {Sizes} consideredSize size of the group to consider
  309. * @returns {boolean} true, if the group was modified
  310. */
  311. const removeProblematicNodes = (group, consideredSize = group.size) => {
  312. const problemTypes = getTooSmallTypes(consideredSize, minSize);
  313. if (problemTypes.size > 0) {
  314. // We hit an edge case where the working set is already smaller than minSize
  315. // We merge problematic nodes with the smallest result node to keep minSize intact
  316. const problemNodes = group.popNodes(
  317. (n) => getNumberOfMatchingSizeTypes(n.size, problemTypes) > 0
  318. );
  319. if (problemNodes === undefined) return false;
  320. // Only merge it with result nodes that have the problematic size type
  321. const possibleResultGroups = result.filter(
  322. (n) => getNumberOfMatchingSizeTypes(n.size, problemTypes) > 0
  323. );
  324. if (possibleResultGroups.length > 0) {
  325. const bestGroup = possibleResultGroups.reduce((min, group) => {
  326. const minMatches = getNumberOfMatchingSizeTypes(min, problemTypes);
  327. const groupMatches = getNumberOfMatchingSizeTypes(
  328. group,
  329. problemTypes
  330. );
  331. if (minMatches !== groupMatches) {
  332. return minMatches < groupMatches ? group : min;
  333. }
  334. if (
  335. selectiveSizeSum(min.size, problemTypes) >
  336. selectiveSizeSum(group.size, problemTypes)
  337. ) {
  338. return group;
  339. }
  340. return min;
  341. });
  342. for (const node of problemNodes) bestGroup.nodes.push(node);
  343. bestGroup.nodes.sort((a, b) => {
  344. if (a.key < b.key) return -1;
  345. if (a.key > b.key) return 1;
  346. return 0;
  347. });
  348. } else {
  349. // There are no other nodes with the same size types
  350. // We create a new group and have to accept that it's smaller than minSize
  351. result.push(new Group(problemNodes, null));
  352. }
  353. return true;
  354. }
  355. return false;
  356. };
  357. if (initialGroup.nodes.length > 0) {
  358. const queue = [initialGroup];
  359. while (queue.length) {
  360. const group = /** @type {Group<T>} */ (queue.pop());
  361. // only groups bigger than maxSize need to be splitted
  362. if (!isTooBig(group.size, maxSize)) {
  363. result.push(group);
  364. continue;
  365. }
  366. // If the group is already too small
  367. // we try to work only with the unproblematic nodes
  368. if (removeProblematicNodes(group)) {
  369. // This changed something, so we try this group again
  370. queue.push(group);
  371. continue;
  372. }
  373. // find unsplittable area from left and right
  374. // going minSize from left and right
  375. // at least one node need to be included otherwise we get stuck
  376. let left = 1;
  377. /** @type {Sizes} */
  378. const leftSize = Object.create(null);
  379. addSizeTo(leftSize, group.nodes[0].size);
  380. while (left < group.nodes.length && isTooSmall(leftSize, minSize)) {
  381. addSizeTo(leftSize, group.nodes[left].size);
  382. left++;
  383. }
  384. let right = group.nodes.length - 2;
  385. /** @type {Sizes} */
  386. const rightSize = Object.create(null);
  387. addSizeTo(rightSize, group.nodes[group.nodes.length - 1].size);
  388. while (right >= 0 && isTooSmall(rightSize, minSize)) {
  389. addSizeTo(rightSize, group.nodes[right].size);
  390. right--;
  391. }
  392. // left v v right
  393. // [ O O O ] O O O [ O O O ]
  394. // ^^^^^^^^^ leftSize
  395. // rightSize ^^^^^^^^^
  396. // leftSize > minSize
  397. // rightSize > minSize
  398. // Perfect split: [ O O O ] [ O O O ]
  399. // right === left - 1
  400. if (left - 1 > right) {
  401. // We try to remove some problematic nodes to "fix" that
  402. /** @type {Sizes} */
  403. let prevSize;
  404. if (right < group.nodes.length - left) {
  405. subtractSizeFrom(rightSize, group.nodes[right + 1].size);
  406. prevSize = rightSize;
  407. } else {
  408. subtractSizeFrom(leftSize, group.nodes[left - 1].size);
  409. prevSize = leftSize;
  410. }
  411. if (removeProblematicNodes(group, prevSize)) {
  412. // This changed something, so we try this group again
  413. queue.push(group);
  414. continue;
  415. }
  416. // can't split group while holding minSize
  417. // because minSize is preferred of maxSize we return
  418. // the problematic nodes as result here even while it's too big
  419. // To avoid this make sure maxSize > minSize * 3
  420. result.push(group);
  421. continue;
  422. }
  423. if (left <= right) {
  424. // when there is a area between left and right
  425. // we look for best split point
  426. // we split at the minimum similarity
  427. // here key space is separated the most
  428. // But we also need to make sure to not create too small groups
  429. let best = -1;
  430. let bestSimilarity = Infinity;
  431. let pos = left;
  432. const rightSize = sumSize(group.nodes.slice(pos));
  433. // pos v v right
  434. // [ O O O ] O O O [ O O O ]
  435. // ^^^^^^^^^ leftSize
  436. // rightSize ^^^^^^^^^^^^^^^
  437. while (pos <= right + 1) {
  438. const similarity =
  439. /** @type {Similarities} */
  440. (group.similarities)[pos - 1];
  441. if (
  442. similarity < bestSimilarity &&
  443. !isTooSmall(leftSize, minSize) &&
  444. !isTooSmall(rightSize, minSize)
  445. ) {
  446. best = pos;
  447. bestSimilarity = similarity;
  448. }
  449. addSizeTo(leftSize, group.nodes[pos].size);
  450. subtractSizeFrom(rightSize, group.nodes[pos].size);
  451. pos++;
  452. }
  453. if (best < 0) {
  454. // This can't happen
  455. // but if that assumption is wrong
  456. // fallback to a big group
  457. result.push(group);
  458. continue;
  459. }
  460. left = best;
  461. right = best - 1;
  462. }
  463. // create two new groups for left and right area
  464. // and queue them up
  465. /** @type {Node<T>[]} */
  466. const rightNodes = [group.nodes[right + 1]];
  467. /** @type {Similarities} */
  468. const rightSimilarities = [];
  469. for (let i = right + 2; i < group.nodes.length; i++) {
  470. rightSimilarities.push(
  471. /** @type {Similarities} */ (group.similarities)[i - 1]
  472. );
  473. rightNodes.push(group.nodes[i]);
  474. }
  475. queue.push(new Group(rightNodes, rightSimilarities));
  476. /** @type {Node<T>[]} */
  477. const leftNodes = [group.nodes[0]];
  478. /** @type {Similarities} */
  479. const leftSimilarities = [];
  480. for (let i = 1; i < left; i++) {
  481. leftSimilarities.push(
  482. /** @type {Similarities} */ (group.similarities)[i - 1]
  483. );
  484. leftNodes.push(group.nodes[i]);
  485. }
  486. queue.push(new Group(leftNodes, leftSimilarities));
  487. }
  488. }
  489. }
  490. // lexically ordering
  491. result.sort((a, b) => {
  492. if (a.nodes[0].key < b.nodes[0].key) return -1;
  493. if (a.nodes[0].key > b.nodes[0].key) return 1;
  494. return 0;
  495. });
  496. // give every group a name
  497. /** @type {Set<string>} */
  498. const usedNames = new Set();
  499. for (let i = 0; i < result.length; i++) {
  500. const group = result[i];
  501. if (group.nodes.length === 1) {
  502. group.key = group.nodes[0].key;
  503. } else {
  504. const first = group.nodes[0];
  505. const last = group.nodes[group.nodes.length - 1];
  506. const name = getName(first.key, last.key, usedNames);
  507. group.key = name;
  508. }
  509. }
  510. // return the results
  511. return result.map(
  512. (group) =>
  513. /** @type {GroupedItems<T>} */
  514. ({
  515. key: group.key,
  516. items: group.nodes.map((node) => node.item),
  517. size: group.size
  518. })
  519. );
  520. };