deterministicGrouping.js 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549
  1. /*
  2. MIT License http://www.opensource.org/licenses/mit-license.php
  3. Author Tobias Koppers @sokra
  4. */
  5. "use strict";
  // Simulations show these probabilities for a single change:
  // 93.1% that one group is invalidated
  // 4.8% that two groups are invalidated
  // 1.1% that 3 groups are invalidated
  // 0.1% that 4 or more groups are invalidated
  //
  // And these for removing/adding 10 lexically adjacent files:
  // 64.5% that one group is invalidated
  // 24.8% that two groups are invalidated
  // 7.8% that 3 groups are invalidated
  // 2.7% that 4 or more groups are invalidated
  //
  // And these for removing/adding 3 random files:
  // 0% that one group is invalidated
  // 3.7% that two groups are invalidated
  // 80.8% that 3 groups are invalidated
  // 12.3% that 4 groups are invalidated
  // 3.2% that 5 or more groups are invalidated
  /**
   * Scores how alike two keys are by comparing them position by position.
   * Each compared position contributes between 0 and 10 points: 10 when both
   * char codes are equal, one point less per unit of char code distance, and
   * nothing once the distance reaches 10. Only the first
   * `min(a.length, b.length)` positions are compared.
   * @param {string} a key
   * @param {string} b key
   * @returns {number} the similarity as number
   */
  const similarity = (a, b) => {
    const compared = a.length < b.length ? a.length : b.length;
    let score = 0;
    let index = 0;
    while (index < compared) {
      const gap = Math.abs(a.charCodeAt(index) - b.charCodeAt(index));
      // positions that are 10 or more char codes apart contribute nothing
      if (gap < 10) score += 10 - gap;
      index++;
    }
    return score;
  };
  /**
   * Derives a short name for a range of keys from its first and last key.
   * The first candidate is the common prefix of `a` and `b` plus the first
   * differing character of `a`. While the lower-cased candidate is already in
   * `usedNames`, the candidate is extended by one more character of `a`. A
   * candidate is only considered while it is shorter than the shorter of the
   * two keys; the accepted candidate is recorded (lower-cased) in `usedNames`.
   * @param {string} a key
   * @param {string} b key
   * @param {Set<string>} usedNames set of already used names
   * @returns {string} the common part and a single char for the difference
   */
  const getName = (a, b, usedNames) => {
    const limit = Math.min(a.length, b.length);
    // length of the shared prefix ...
    let end = 0;
    while (end < limit && a.charCodeAt(end) === b.charCodeAt(end)) end++;
    // ... plus the first differing character, when there is one
    if (end < limit) end++;
    for (; end < limit; end++) {
      const candidate = a.slice(0, end);
      const normalized = candidate.toLowerCase();
      if (usedNames.has(normalized)) continue;
      usedNames.add(normalized);
      return candidate;
    }
    // names always contain a hash, so this is always unique
    // we don't need to check usedNames nor add it
    return a;
  };
  /** @typedef {Record<string, number>} Sizes */
  /**
   * Adds every size type of `size` onto `total`, mutating `total` in place.
   * Size types that `total` does not have yet start from 0.
   * @param {Sizes} total total size
   * @param {Sizes} size single size
   * @returns {void}
   */
  const addSizeTo = (total, size) => {
    for (const [type, amount] of Object.entries(size)) {
      total[type] = (total[type] || 0) + amount;
    }
  };
  /**
   * Subtracts every size type of `size` from `total`, mutating `total` in
   * place. Size types of `total` that are not present in `size` are untouched.
   * @param {Sizes} total total size
   * @param {Sizes} size single size
   * @returns {void}
   */
  const subtractSizeFrom = (total, size) => {
    for (const [type, amount] of Object.entries(size)) {
      total[type] -= amount;
    }
  };
  /**
   * Accumulates the sizes of all given nodes into a fresh, prototype-less
   * object.
   * @template T
   * @param {Iterable<Node<T>>} nodes some nodes
   * @returns {Sizes} total size
   */
  const sumSize = (nodes) => {
    const total = Object.create(null);
    for (const { size } of nodes) addSizeTo(total, size);
    return total;
  };
  /**
   * Checks whether any non-zero size type exceeds its maximum. Size types
   * without a numeric entry in `maxSize` are unlimited.
   * @param {Sizes} size size
   * @param {Sizes} maxSize maximum size
   * @returns {boolean} true, when size is too big
   */
  const isTooBig = (size, maxSize) =>
    Object.keys(size).some((type) => {
      const amount = size[type];
      if (amount === 0) return false;
      const limit = maxSize[type];
      return typeof limit === "number" && amount > limit;
    });
  /**
   * Checks whether any non-zero size type is below its minimum. Size types
   * with a size of exactly 0, or without a numeric entry in `minSize`, are
   * never considered too small.
   * @param {Sizes} size size
   * @param {Sizes} minSize minimum size
   * @returns {boolean} true, when size is too small
   */
  const isTooSmall = (size, minSize) => {
    for (const [type, amount] of Object.entries(size)) {
      if (amount === 0) continue;
      const limit = minSize[type];
      if (typeof limit !== "number") continue;
      if (amount < limit) return true;
    }
    return false;
  };
  /** @typedef {Set<string>} Types */
  /**
   * Collects the size types that are non-zero but below their minimum. Size
   * types without a numeric entry in `minSize` are never reported.
   * @param {Sizes} size size
   * @param {Sizes} minSize minimum size
   * @returns {Types} set of types that are too small
   */
  const getTooSmallTypes = (size, minSize) => {
    const tooSmall = Object.keys(size).filter((type) => {
      const amount = size[type];
      if (amount === 0) return false;
      const limit = minSize[type];
      return typeof limit === "number" && amount < limit;
    });
    /** @type {Types} */
    const types = new Set(tooSmall);
    return types;
  };
  /**
   * Counts the own enumerable keys of `size` that hold a value other than 0
   * and are contained in `types`.
   * @template {object} T
   * @param {T} size size
   * @param {Types} types types
   * @returns {number} number of matching size types
   */
  const getNumberOfMatchingSizeTypes = (size, types) =>
    Object.keys(size).filter(
      (key) => size[/** @type {keyof T} */ (key)] !== 0 && types.has(key)
    ).length;
  /**
   * Adds up the sizes of only those size types that are contained in `types`.
   * @param {Sizes} size size
   * @param {Types} types types
   * @returns {number} selective size sum
   */
  const selectiveSizeSum = (size, types) =>
    Object.keys(size).reduce(
      (sum, type) =>
        size[type] !== 0 && types.has(type) ? sum + size[type] : sum,
      0
    );
  /**
   * A single item to be grouped, paired with the key it is ordered by and its
   * size per size type.
   * @template T
   */
  class Node {
    /**
     * @param {T} item item
     * @param {string} key key
     * @param {Sizes} size size
     */
    constructor(item, key, size) {
      /** @type {T} the wrapped item */
      this.item = item;
      /** @type {string} key used for ordering and naming */
      this.key = key;
      /** @type {Sizes} size of the item per size type */
      this.size = size;
    }
  }
  /** @typedef {number[]} Similarities */
  /**
   * A list of nodes together with the similarities between neighbouring nodes
   * and the accumulated size of all nodes.
   * @template T
   */
  class Group {
    /**
     * @param {Node<T>[]} nodes nodes
     * @param {Similarities | null} similarities similarities between the nodes (length = nodes.length - 1)
     * @param {Sizes=} size size of the group
     */
    constructor(nodes, similarities, size) {
      this.nodes = nodes;
      this.similarities = similarities;
      // without an explicit size the size is the sum of all node sizes
      this.size = size || sumSize(nodes);
      /** @type {string | undefined} */
      this.key = undefined;
    }

    /**
     * Removes all nodes matching `filter` from this group and rebuilds the
     * node list, similarities and size from the remaining nodes. When every
     * node matches, the group is left untouched and nothing is returned.
     * @param {(node: Node<T>) => boolean} filter filter function
     * @returns {Node<T>[] | undefined} removed nodes
     */
    popNodes(filter) {
      const kept = [];
      const keptSimilarities = [];
      const removed = [];
      let previousKept;
      for (const [index, node] of this.nodes.entries()) {
        if (filter(node)) {
          removed.push(node);
          continue;
        }
        if (kept.length > 0) {
          // the stored similarity is only valid when both nodes were already
          // neighbours; otherwise it has to be computed for the new pair
          const stillAdjacent = previousKept === this.nodes[index - 1];
          keptSimilarities.push(
            stillAdjacent
              ? /** @type {Similarities} */ (this.similarities)[index - 1]
              : similarity(/** @type {Node<T>} */ (previousKept).key, node.key)
          );
        }
        kept.push(node);
        previousKept = node;
      }
      if (removed.length === this.nodes.length) return undefined;
      this.nodes = kept;
      this.similarities = keptSimilarities;
      this.size = sumSize(kept);
      return removed;
    }
  }
  /**
   * Computes the similarity of every pair of neighbouring nodes, in iteration
   * order. The result has one entry less than there are nodes (and is empty
   * for fewer than two nodes).
   * @template T
   * @param {Iterable<Node<T>>} nodes nodes
   * @returns {Similarities} similarities
   */
  const getSimilarities = (nodes) => {
    const ordered = Array.from(nodes);
    /** @type {Similarities} */
    const similarities = ordered
      .slice(1)
      // `ordered[index]` is the predecessor of `node` in the unsliced list
      .map((node, index) => similarity(ordered[index].key, node.key));
    return similarities;
  };
  /**
   * @template T
   * @typedef {object} GroupedItems<T>
   * @property {string} key
   * @property {T[]} items
   * @property {Sizes} size
   */
  /**
   * @template T
   * @typedef {object} Options
   * @property {Sizes} maxSize maximum size of a group
   * @property {Sizes} minSize minimum size of a group (preferred over maximum size)
   * @property {Iterable<T>} items a list of items
   * @property {(item: T) => Sizes} getSize function to get size of an item
   * @property {(item: T) => string} getKey function to get the key of an item
   */
  269. /**
  270. * @template T
  271. * @param {Options<T>} options options object
  272. * @returns {GroupedItems<T>[]} grouped items
  273. */
  274. module.exports = ({ maxSize, minSize, items, getSize, getKey }) => {
  275. /** @type {Group<T>[]} */
  276. const result = [];
  277. const nodes = Array.from(
  278. items,
  279. (item) => new Node(item, getKey(item), getSize(item))
  280. );
  281. /** @type {Node<T>[]} */
  282. const initialNodes = [];
  283. // lexically ordering of keys
  284. nodes.sort((a, b) => {
  285. if (a.key < b.key) return -1;
  286. if (a.key > b.key) return 1;
  287. return 0;
  288. });
  289. // return nodes bigger than maxSize directly as group
  290. // But make sure that minSize is not violated
  291. for (const node of nodes) {
  292. if (isTooBig(node.size, maxSize) && !isTooSmall(node.size, minSize)) {
  293. result.push(new Group([node], []));
  294. } else {
  295. initialNodes.push(node);
  296. }
  297. }
  298. if (initialNodes.length > 0) {
  299. const initialGroup = new Group(initialNodes, getSimilarities(initialNodes));
  300. /**
  301. * @param {Group<T>} group group
  302. * @param {Sizes} consideredSize size of the group to consider
  303. * @returns {boolean} true, if the group was modified
  304. */
  305. const removeProblematicNodes = (group, consideredSize = group.size) => {
  306. const problemTypes = getTooSmallTypes(consideredSize, minSize);
  307. if (problemTypes.size > 0) {
  308. // We hit an edge case where the working set is already smaller than minSize
  309. // We merge problematic nodes with the smallest result node to keep minSize intact
  310. const problemNodes = group.popNodes(
  311. (n) => getNumberOfMatchingSizeTypes(n.size, problemTypes) > 0
  312. );
  313. if (problemNodes === undefined) return false;
  314. // Only merge it with result nodes that have the problematic size type
  315. const possibleResultGroups = result.filter(
  316. (n) => getNumberOfMatchingSizeTypes(n.size, problemTypes) > 0
  317. );
  318. if (possibleResultGroups.length > 0) {
  319. const bestGroup = possibleResultGroups.reduce((min, group) => {
  320. const minMatches = getNumberOfMatchingSizeTypes(min, problemTypes);
  321. const groupMatches = getNumberOfMatchingSizeTypes(
  322. group,
  323. problemTypes
  324. );
  325. if (minMatches !== groupMatches) {
  326. return minMatches < groupMatches ? group : min;
  327. }
  328. if (
  329. selectiveSizeSum(min.size, problemTypes) >
  330. selectiveSizeSum(group.size, problemTypes)
  331. ) {
  332. return group;
  333. }
  334. return min;
  335. });
  336. for (const node of problemNodes) bestGroup.nodes.push(node);
  337. bestGroup.nodes.sort((a, b) => {
  338. if (a.key < b.key) return -1;
  339. if (a.key > b.key) return 1;
  340. return 0;
  341. });
  342. } else {
  343. // There are no other nodes with the same size types
  344. // We create a new group and have to accept that it's smaller than minSize
  345. result.push(new Group(problemNodes, null));
  346. }
  347. return true;
  348. }
  349. return false;
  350. };
  351. if (initialGroup.nodes.length > 0) {
  352. const queue = [initialGroup];
  353. while (queue.length) {
  354. const group = /** @type {Group<T>} */ (queue.pop());
  355. // only groups bigger than maxSize need to be splitted
  356. if (!isTooBig(group.size, maxSize)) {
  357. result.push(group);
  358. continue;
  359. }
  360. // If the group is already too small
  361. // we try to work only with the unproblematic nodes
  362. if (removeProblematicNodes(group)) {
  363. // This changed something, so we try this group again
  364. queue.push(group);
  365. continue;
  366. }
  367. // find unsplittable area from left and right
  368. // going minSize from left and right
  369. // at least one node need to be included otherwise we get stuck
  370. let left = 1;
  371. const leftSize = Object.create(null);
  372. addSizeTo(leftSize, group.nodes[0].size);
  373. while (left < group.nodes.length && isTooSmall(leftSize, minSize)) {
  374. addSizeTo(leftSize, group.nodes[left].size);
  375. left++;
  376. }
  377. let right = group.nodes.length - 2;
  378. const rightSize = Object.create(null);
  379. addSizeTo(rightSize, group.nodes[group.nodes.length - 1].size);
  380. while (right >= 0 && isTooSmall(rightSize, minSize)) {
  381. addSizeTo(rightSize, group.nodes[right].size);
  382. right--;
  383. }
  384. // left v v right
  385. // [ O O O ] O O O [ O O O ]
  386. // ^^^^^^^^^ leftSize
  387. // rightSize ^^^^^^^^^
  388. // leftSize > minSize
  389. // rightSize > minSize
  390. // Perfect split: [ O O O ] [ O O O ]
  391. // right === left - 1
  392. if (left - 1 > right) {
  393. // We try to remove some problematic nodes to "fix" that
  394. let prevSize;
  395. if (right < group.nodes.length - left) {
  396. subtractSizeFrom(rightSize, group.nodes[right + 1].size);
  397. prevSize = rightSize;
  398. } else {
  399. subtractSizeFrom(leftSize, group.nodes[left - 1].size);
  400. prevSize = leftSize;
  401. }
  402. if (removeProblematicNodes(group, prevSize)) {
  403. // This changed something, so we try this group again
  404. queue.push(group);
  405. continue;
  406. }
  407. // can't split group while holding minSize
  408. // because minSize is preferred of maxSize we return
  409. // the problematic nodes as result here even while it's too big
  410. // To avoid this make sure maxSize > minSize * 3
  411. result.push(group);
  412. continue;
  413. }
  414. if (left <= right) {
  415. // when there is a area between left and right
  416. // we look for best split point
  417. // we split at the minimum similarity
  418. // here key space is separated the most
  419. // But we also need to make sure to not create too small groups
  420. let best = -1;
  421. let bestSimilarity = Infinity;
  422. let pos = left;
  423. const rightSize = sumSize(group.nodes.slice(pos));
  424. // pos v v right
  425. // [ O O O ] O O O [ O O O ]
  426. // ^^^^^^^^^ leftSize
  427. // rightSize ^^^^^^^^^^^^^^^
  428. while (pos <= right + 1) {
  429. const similarity =
  430. /** @type {Similarities} */
  431. (group.similarities)[pos - 1];
  432. if (
  433. similarity < bestSimilarity &&
  434. !isTooSmall(leftSize, minSize) &&
  435. !isTooSmall(rightSize, minSize)
  436. ) {
  437. best = pos;
  438. bestSimilarity = similarity;
  439. }
  440. addSizeTo(leftSize, group.nodes[pos].size);
  441. subtractSizeFrom(rightSize, group.nodes[pos].size);
  442. pos++;
  443. }
  444. if (best < 0) {
  445. // This can't happen
  446. // but if that assumption is wrong
  447. // fallback to a big group
  448. result.push(group);
  449. continue;
  450. }
  451. left = best;
  452. right = best - 1;
  453. }
  454. // create two new groups for left and right area
  455. // and queue them up
  456. const rightNodes = [group.nodes[right + 1]];
  457. /** @type {Similarities} */
  458. const rightSimilarities = [];
  459. for (let i = right + 2; i < group.nodes.length; i++) {
  460. rightSimilarities.push(
  461. /** @type {Similarities} */ (group.similarities)[i - 1]
  462. );
  463. rightNodes.push(group.nodes[i]);
  464. }
  465. queue.push(new Group(rightNodes, rightSimilarities));
  466. const leftNodes = [group.nodes[0]];
  467. /** @type {Similarities} */
  468. const leftSimilarities = [];
  469. for (let i = 1; i < left; i++) {
  470. leftSimilarities.push(
  471. /** @type {Similarities} */ (group.similarities)[i - 1]
  472. );
  473. leftNodes.push(group.nodes[i]);
  474. }
  475. queue.push(new Group(leftNodes, leftSimilarities));
  476. }
  477. }
  478. }
  479. // lexically ordering
  480. result.sort((a, b) => {
  481. if (a.nodes[0].key < b.nodes[0].key) return -1;
  482. if (a.nodes[0].key > b.nodes[0].key) return 1;
  483. return 0;
  484. });
  485. // give every group a name
  486. const usedNames = new Set();
  487. for (let i = 0; i < result.length; i++) {
  488. const group = result[i];
  489. if (group.nodes.length === 1) {
  490. group.key = group.nodes[0].key;
  491. } else {
  492. const first = group.nodes[0];
  493. const last = group.nodes[group.nodes.length - 1];
  494. const name = getName(first.key, last.key, usedNames);
  495. group.key = name;
  496. }
  497. }
  498. // return the results
  499. return result.map(
  500. (group) =>
  501. /** @type {GroupedItems<T>} */
  502. ({
  503. key: group.key,
  504. items: group.nodes.map((node) => node.item),
  505. size: group.size
  506. })
  507. );
  508. };