diff_match_patch.h 20 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628
  1. /*
  2. * Copyright 2008 Google Inc. All Rights Reserved.
  3. * Author: [email protected] (Neil Fraser)
  4. * Author: [email protected] (Mike Slemmer)
  5. *
  6. * Licensed under the Apache License, Version 2.0 (the "License");
  7. * you may not use this file except in compliance with the License.
  8. * You may obtain a copy of the License at
  9. *
  10. * http://www.apache.org/licenses/LICENSE-2.0
  11. *
  12. * Unless required by applicable law or agreed to in writing, software
  13. * distributed under the License is distributed on an "AS IS" BASIS,
  14. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  15. * See the License for the specific language governing permissions and
  16. * limitations under the License.
  17. *
  18. * Diff Match and Patch
  19. * http://code.google.com/p/google-diff-match-patch/
  20. */
  21. #ifndef DIFF_MATCH_PATCH_H
  22. #define DIFF_MATCH_PATCH_H
  23. /*
  24. * Functions for diff, match and patch.
  25. * Computes the difference between two texts to create a patch.
  26. * Applies the patch onto another text, allowing for errors.
  27. *
  28. * @author [email protected] (Neil Fraser)
  29. *
  30. * Qt/C++ port by [email protected] (Mike Slemmer):
  31. *
  32. * Code known to compile and run with Qt 4.3 through Qt 4.7.
  33. *
  34. * Here is a trivial sample program which works properly when linked with this
  35. * library:
  36. *
  37. #include <QtCore>
  38. #include <QString>
  39. #include <QList>
  40. #include <QMap>
  41. #include <QVariant>
  42. #include "diff_match_patch.h"
  43. int main(int argc, char **argv) {
  44. diff_match_patch dmp;
  45. QString str1 = QString("First string in diff");
  46. QString str2 = QString("Second string in diff");
  47. QString strPatch = dmp.patch_toText(dmp.patch_make(str1, str2));
  48. QPair<QString, QVector<bool> > out
  49. = dmp.patch_apply(dmp.patch_fromText(strPatch), str1);
  50. QString strResult = out.first;
  51. // here, strResult will equal str2 above.
  52. return 0;
  53. }
  54. */
  55. /**-
  56. * The data structure representing a diff is a Linked list of Diff objects:
  57. * {Diff(Operation.DELETE, "Hello"), Diff(Operation.INSERT, "Goodbye"),
  58. * Diff(Operation.EQUAL, " world.")}
  59. * which means: delete "Hello", add "Goodbye" and keep " world."
  60. */
  61. enum Operation {
  62. DELETE, INSERT, EQUAL
  63. };
  64. /**
  65. * Class representing one diff operation.
  66. */
  67. class Diff {
  68. public:
  69. Operation operation;
  70. // One of: INSERT, DELETE or EQUAL.
  71. QString text;
  72. // The text associated with this diff operation.
  73. /**
  74. * Constructor. Initializes the diff with the provided values.
  75. * @param operation One of INSERT, DELETE or EQUAL.
  76. * @param text The text being applied.
  77. */
  78. Diff(Operation _operation, const QString &_text);
  79. Diff();
  80. inline bool isNull() const;
  81. QString toString() const;
  82. bool operator==(const Diff &d) const;
  83. bool operator!=(const Diff &d) const;
  84. static QString strOperation(Operation op);
  85. };
  86. /**
  87. * Class representing one patch operation.
  88. */
  89. class Patch {
  90. public:
  91. QList<Diff> diffs;
  92. int start1;
  93. int start2;
  94. int length1;
  95. int length2;
  96. /**
  97. * Constructor. Initializes with an empty list of diffs.
  98. */
  99. Patch();
  100. bool isNull() const;
  101. QString toString()C;
  102. };
  103. /**
  104. * Class containing the diff, match and patch methods.
  105. * Also contains the behaviour settings.
  106. */
  107. class diff_match_patch {
  108. friend class diff_match_patch_test;
  109. public:
  110. // Defaults.
  111. // Set these on your diff_match_patch instance to override the defaults.
  112. // Number of seconds to map a diff before giving up (0 for infinity).
  113. float Diff_Timeout;
  114. // Cost of an empty edit operation in terms of edit characters.
  115. short Diff_EditCost;
  116. // At what point is no match declared (0.0 = perfection, 1.0 = very loose).
  117. float Match_Threshold;
  118. // How far to search for a match (0 = exact location, 1000+ = broad match).
  119. // A match this many characters away from the expected location will add
  120. // 1.0 to the score (0.0 is a perfect match).
  121. int Match_Distance;
  122. // When deleting a large block of text (over ~64 characters), how close does
  123. // the contents have to match the expected contents. (0.0 = perfection,
  124. // 1.0 = very loose). Note that Match_Threshold controls how closely the
  125. // end points of a delete need to match.
  126. float Patch_DeleteThreshold;
  127. // Chunk size for context length.
  128. short Patch_Margin;
  129. // The number of bits in an int.
  130. short Match_MaxBits;
  131. private:
  132. // Define some regex patterns for matching boundaries.
  133. //static QRegExp BLANKLINEEND;
  134. //static QRegExp BLANKLINESTART;
  135. public:
  136. diff_match_patch();
  137. // DIFF FUNCTIONS
  138. /**
  139. * Find the differences between two texts.
  140. * Run a faster slightly less optimal diff.
  141. * This method allows the 'checklines' of diff_main() to be optional.
  142. * Most of the time checklines is wanted, so default to true.
  143. * @param text1 Old string to be diffed.
  144. * @param text2 New string to be diffed.
  145. * @return Linked List of Diff objects.
  146. */
  147. QList<Diff> diff_main(const QString &text1, const QString &text2);
  148. /**
  149. * Find the differences between two texts.
  150. * @param text1 Old string to be diffed.
  151. * @param text2 New string to be diffed.
  152. * @param checklines Speedup flag. If false, then don't run a
  153. * line-level diff first to identify the changed areas.
  154. * If true, then run a faster slightly less optimal diff.
  155. * @return Linked List of Diff objects.
  156. */
  157. QList<Diff> diff_main(const QString &text1, const QString &text2, bool checklines);
  158. /**
  159. * Find the differences between two texts. Simplifies the problem by
  160. * stripping any common prefix or suffix off the texts before diffing.
  161. * @param text1 Old string to be diffed.
  162. * @param text2 New string to be diffed.
  163. * @param checklines Speedup flag. If false, then don't run a
  164. * line-level diff first to identify the changed areas.
  165. * If true, then run a faster slightly less optimal diff.
  166. * @param deadline Time when the diff should be complete by. Used
  167. * internally for recursive calls. Users should set DiffTimeout instead.
  168. * @return Linked List of Diff objects.
  169. */
  170. private:
  171. QList<Diff> diff_main(const QString &text1, const QString &text2, bool checklines, clock_t deadline);
  172. /**
  173. * Find the differences between two texts. Assumes that the texts do not
  174. * have any common prefix or suffix.
  175. * @param text1 Old string to be diffed.
  176. * @param text2 New string to be diffed.
  177. * @param checklines Speedup flag. If false, then don't run a
  178. * line-level diff first to identify the changed areas.
  179. * If true, then run a faster slightly less optimal diff.
  180. * @param deadline Time when the diff should be complete by.
  181. * @return Linked List of Diff objects.
  182. */
  183. private:
  184. QList<Diff> diff_compute(QString text1, QString text2, bool checklines, clock_t deadline);
  185. /**
  186. * Do a quick line-level diff on both strings, then rediff the parts for
  187. * greater accuracy.
  188. * This speedup can produce non-minimal diffs.
  189. * @param text1 Old string to be diffed.
  190. * @param text2 New string to be diffed.
  191. * @param deadline Time when the diff should be complete by.
  192. * @return Linked List of Diff objects.
  193. */
  194. private:
  195. QList<Diff> diff_lineMode(QString text1, QString text2, clock_t deadline);
  196. /**
  197. * Find the 'middle snake' of a diff, split the problem in two
  198. * and return the recursively constructed diff.
  199. * See Myers 1986 paper: An O(ND) Difference Algorithm and Its Variations.
  200. * @param text1 Old string to be diffed.
  201. * @param text2 New string to be diffed.
  202. * @return Linked List of Diff objects.
  203. */
  204. protected:
  205. QList<Diff> diff_bisect(const QString &text1, const QString &text2, clock_t deadline);
  206. /**
  207. * Given the location of the 'middle snake', split the diff in two parts
  208. * and recurse.
  209. * @param text1 Old string to be diffed.
  210. * @param text2 New string to be diffed.
  211. * @param x Index of split point in text1.
  212. * @param y Index of split point in text2.
  213. * @param deadline Time at which to bail if not yet complete.
  214. * @return LinkedList of Diff objects.
  215. */
  216. private:
  217. QList<Diff> diff_bisectSplit(const QString &text1, const QString &text2, int x, int y, clock_t deadline);
  218. /**
  219. * Split two texts into a list of strings. Reduce the texts to a string of
  220. * hashes where each Unicode character represents one line.
  221. * @param text1 First string.
  222. * @param text2 Second string.
  223. * @return Three element Object array, containing the encoded text1, the
  224. * encoded text2 and the List of unique strings. The zeroth element
  225. * of the List of unique strings is intentionally blank.
  226. */
  227. protected:
  228. QList<QVariant> diff_linesToChars(const QString &text1, const QString &text2); // return elems 0 and 1 are QString, elem 2 is QStringList
  229. /**
  230. * Split a text into a list of strings. Reduce the texts to a string of
  231. * hashes where each Unicode character represents one line.
  232. * @param text String to encode.
  233. * @param lineArray List of unique strings.
  234. * @param lineHash Map of strings to indices.
  235. * @return Encoded string.
  236. */
  237. private:
  238. QString diff_linesToCharsMunge(const QString &text, QStringList &lineArray,
  239. QMap<QString, int> &lineHash);
  240. /**
  241. * Rehydrate the text in a diff from a string of line hashes to real lines of
  242. * text.
  243. * @param diffs LinkedList of Diff objects.
  244. * @param lineArray List of unique strings.
  245. */
  246. private:
  247. void diff_charsToLines(QList<Diff> &diffs, const QStringList &lineArray);
  248. /**
  249. * Determine the common prefix of two strings.
  250. * @param text1 First string.
  251. * @param text2 Second string.
  252. * @return The number of characters common to the start of each string.
  253. */
  254. public:
  255. int diff_commonPrefix(const QString &text1, const QString &text2);
  256. /**
  257. * Determine the common suffix of two strings.
  258. * @param text1 First string.
  259. * @param text2 Second string.
  260. * @return The number of characters common to the end of each string.
  261. */
  262. public:
  263. int diff_commonSuffix(const QString &text1, const QString &text2);
  264. /**
  265. * Determine if the suffix of one string is the prefix of another.
  266. * @param text1 First string.
  267. * @param text2 Second string.
  268. * @return The number of characters common to the end of the first
  269. * string and the start of the second string.
  270. */
  271. protected:
  272. int diff_commonOverlap(const QString &text1, const QString &text2);
  273. /**
  274. * Do the two texts share a substring which is at least half the length of
  275. * the longer text?
  276. * This speedup can produce non-minimal diffs.
  277. * @param text1 First string.
  278. * @param text2 Second string.
  279. * @return Five element String array, containing the prefix of text1, the
  280. * suffix of text1, the prefix of text2, the suffix of text2 and the
  281. * common middle. Or null if there was no match.
  282. */
  283. protected:
  284. QStringList diff_halfMatch(const QString &text1, const QString &text2);
  285. /**
  286. * Does a substring of shorttext exist within longtext such that the
  287. * substring is at least half the length of longtext?
  288. * @param longtext Longer string.
  289. * @param shorttext Shorter string.
  290. * @param i Start index of quarter length substring within longtext.
  291. * @return Five element String array, containing the prefix of longtext, the
  292. * suffix of longtext, the prefix of shorttext, the suffix of shorttext
  293. * and the common middle. Or null if there was no match.
  294. */
  295. private:
  296. QStringList diff_halfMatchI(const QString &longtext, const QString &shorttext, int i);
  297. /**
  298. * Reduce the number of edits by eliminating semantically trivial equalities.
  299. * @param diffs LinkedList of Diff objects.
  300. */
  301. public:
  302. void diff_cleanupSemantic(QList<Diff> &diffs);
  303. /**
  304. * Look for single edits surrounded on both sides by equalities
  305. * which can be shifted sideways to align the edit to a word boundary.
  306. * e.g: The c<ins>at c</ins>ame. -> The <ins>cat </ins>came.
  307. * @param diffs LinkedList of Diff objects.
  308. */
  309. public:
  310. void diff_cleanupSemanticLossless(QList<Diff> &diffs);
  311. /**
  312. * Given two strings, compute a score representing whether the internal
  313. * boundary falls on logical boundaries.
  314. * Scores range from 6 (best) to 0 (worst).
  315. * @param one First string.
  316. * @param two Second string.
  317. * @return The score.
  318. */
  319. private:
  320. int diff_cleanupSemanticScore(const QString &one, const QString &two);
  321. /**
  322. * Reduce the number of edits by eliminating operationally trivial equalities.
  323. * @param diffs LinkedList of Diff objects.
  324. */
  325. public:
  326. void diff_cleanupEfficiency(QList<Diff> &diffs);
  327. /**
  328. * Reorder and merge like edit sections. Merge equalities.
  329. * Any edit section can move as long as it doesn't cross an equality.
  330. * @param diffs LinkedList of Diff objects.
  331. */
  332. public:
  333. void diff_cleanupMerge(QList<Diff> &diffs);
  334. /**
  335. * loc is a location in text1, compute and return the equivalent location in
  336. * text2.
  337. * e.g. "The cat" vs "The big cat", 1->1, 5->8
  338. * @param diffs LinkedList of Diff objects.
  339. * @param loc Location within text1.
  340. * @return Location within text2.
  341. */
  342. public:
  343. int diff_xIndex(const QList<Diff> &diffs, int loc);
  344. /**
  345. * Convert a Diff list into a pretty HTML report.
  346. * @param diffs LinkedList of Diff objects.
  347. * @return HTML representation.
  348. */
  349. public:
  350. QString diff_prettyHtml(const QList<Diff> &diffs);
  351. /**
  352. * Compute and return the source text (all equalities and deletions).
  353. * @param diffs LinkedList of Diff objects.
  354. * @return Source text.
  355. */
  356. public:
  357. QString diff_text1(const QList<Diff> &diffs);
  358. /**
  359. * Compute and return the destination text (all equalities and insertions).
  360. * @param diffs LinkedList of Diff objects.
  361. * @return Destination text.
  362. */
  363. public:
  364. QString diff_text2(const QList<Diff> &diffs);
  365. /**
  366. * Compute the Levenshtein distance; the number of inserted, deleted or
  367. * substituted characters.
  368. * @param diffs LinkedList of Diff objects.
  369. * @return Number of changes.
  370. */
  371. public:
  372. int diff_levenshtein(const QList<Diff> &diffs);
  373. /**
  374. * Crush the diff into an encoded string which describes the operations
  375. * required to transform text1 into text2.
  376. * E.g. =3\t-2\t+ing -> Keep 3 chars, delete 2 chars, insert 'ing'.
  377. * Operations are tab-separated. Inserted text is escaped using %xx notation.
  378. * @param diffs Array of diff tuples.
  379. * @return Delta text.
  380. */
  381. public:
  382. QString diff_toDelta(const QList<Diff> &diffs);
  383. /**
  384. * Given the original text1, and an encoded string which describes the
  385. * operations required to transform text1 into text2, compute the full diff.
  386. * @param text1 Source string for the diff.
  387. * @param delta Delta text.
  388. * @return Array of diff tuples or null if invalid.
  389. * @throws QString If invalid input.
  390. */
  391. public:
  392. QList<Diff> diff_fromDelta(const QString &text1, const QString &delta);
  393. // MATCH FUNCTIONS
  394. /**
  395. * Locate the best instance of 'pattern' in 'text' near 'loc'.
  396. * Returns -1 if no match found.
  397. * @param text The text to search.
  398. * @param pattern The pattern to search for.
  399. * @param loc The location to search around.
  400. * @return Best match index or -1.
  401. */
  402. public:
  403. int match_main(const QString &text, const QString &pattern, int loc);
  404. /**
  405. * Locate the best instance of 'pattern' in 'text' near 'loc' using the
  406. * Bitap algorithm. Returns -1 if no match found.
  407. * @param text The text to search.
  408. * @param pattern The pattern to search for.
  409. * @param loc The location to search around.
  410. * @return Best match index or -1.
  411. */
  412. protected:
  413. int match_bitap(const QString &text, const QString &pattern, int loc);
  414. /**
  415. * Compute and return the score for a match with e errors and x location.
  416. * @param e Number of errors in match.
  417. * @param x Location of match.
  418. * @param loc Expected location of match.
  419. * @param pattern Pattern being sought.
  420. * @return Overall score for match (0.0 = good, 1.0 = bad).
  421. */
  422. private:
  423. double match_bitapScore(int e, int x, int loc, const QString &pattern);
  424. /**
  425. * Initialise the alphabet for the Bitap algorithm.
  426. * @param pattern The text to encode.
  427. * @return Hash of character locations.
  428. */
  429. protected:
  430. QMap<QChar, int> match_alphabet(const QString &pattern);
  431. // PATCH FUNCTIONS
  432. /**
  433. * Increase the context until it is unique,
  434. * but don't let the pattern expand beyond Match_MaxBits.
  435. * @param patch The patch to grow.
  436. * @param text Source text.
  437. */
  438. protected:
  439. void patch_addContext(Patch &patch, const QString &text);
  440. /**
  441. * Compute a list of patches to turn text1 into text2.
  442. * A set of diffs will be computed.
  443. * @param text1 Old text.
  444. * @param text2 New text.
  445. * @return LinkedList of Patch objects.
  446. */
  447. public:
  448. QList<Patch> patch_make(const QString &text1, const QString &text2);
  449. /**
  450. * Compute a list of patches to turn text1 into text2.
  451. * text1 will be derived from the provided diffs.
  452. * @param diffs Array of diff tuples for text1 to text2.
  453. * @return LinkedList of Patch objects.
  454. */
  455. public:
  456. QList<Patch> patch_make(const QList<Diff> &diffs);
  457. /**
  458. * Compute a list of patches to turn text1 into text2.
  459. * text2 is ignored, diffs are the delta between text1 and text2.
  460. * @param text1 Old text.
  461. * @param text2 Ignored.
  462. * @param diffs Array of diff tuples for text1 to text2.
  463. * @return LinkedList of Patch objects.
  464. * @deprecated Prefer patch_make(const QString &text1, const QList<Diff> &diffs).
  465. */
  466. public:
  467. QList<Patch> patch_make(const QString &text1, const QString &text2, const QList<Diff> &diffs);
  468. /**
  469. * Compute a list of patches to turn text1 into text2.
  470. * text2 is not provided, diffs are the delta between text1 and text2.
  471. * @param text1 Old text.
  472. * @param diffs Array of diff tuples for text1 to text2.
  473. * @return LinkedList of Patch objects.
  474. */
  475. public:
  476. QList<Patch> patch_make(const QString &text1, const QList<Diff> &diffs);
  477. /**
  478. * Given an array of patches, return another array that is identical.
  479. * @param patches Array of patch objects.
  480. * @return Array of patch objects.
  481. */
  482. public:
  483. QList<Patch> patch_deepCopy(QList<Patch> &patches);
  484. /**
  485. * Merge a set of patches onto the text. Return a patched text, as well
  486. * as an array of true/false values indicating which patches were applied.
  487. * @param patches Array of patch objects.
  488. * @param text Old text.
  489. * @return Two element Object array, containing the new text and an array of
  490. * boolean values.
  491. */
  492. public:
  493. QPair<QString,QVector<bool> > patch_apply(QList<Patch> &patches, const QString &text);
  494. /**
  495. * Add some padding on text start and end so that edges can match something.
  496. * Intended to be called only from within patch_apply.
  497. * @param patches Array of patch objects.
  498. * @return The padding string added to each side.
  499. */
  500. public:
  501. QString patch_addPadding(QList<Patch> &patches);
  502. /**
  503. * Look through the patches and break up any which are longer than the
  504. * maximum limit of the match algorithm.
  505. * Intended to be called only from within patch_apply.
  506. * @param patches LinkedList of Patch objects.
  507. */
  508. public:
  509. void patch_splitMax(QList<Patch> &patches);
  510. /**
  511. * Take a list of patches and return a textual representation.
  512. * @param patches List of Patch objects.
  513. * @return Text representation of patches.
  514. */
  515. public:
  516. QString patch_toText(const QList<Patch> &patches);
  517. /**
  518. * Parse a textual representation of patches and return a List of Patch
  519. * objects.
  520. * @param textline Text representation of patches.
  521. * @return List of Patch objects.
  522. * @throws QString If invalid input.
  523. */
  524. public:
  525. QList<Patch> patch_fromText(const QString &textline);
  526. /**
  527. * A safer version of QString.mid(pos). This one returns "" instead of
  528. * null when the postion equals the string length.
  529. * @param str String to take a substring from.
  530. * @param pos Position to start the substring from.
  531. * @return Substring.
  532. */
  533. private:
  534. static inline QString safeMid(const QString &str, int pos) {
  535. return (pos == str.length()) ? QString("") : str.mid(pos);
  536. }
  537. /**
  538. * A safer version of QString.mid(pos, len). This one returns "" instead of
  539. * null when the postion equals the string length.
  540. * @param str String to take a substring from.
  541. * @param pos Position to start the substring from.
  542. * @param len Length of substring.
  543. * @return Substring.
  544. */
  545. private:
  546. static inline QString safeMid(const QString &str, int pos, int len) {
  547. return (pos == str.length()) ? QString("") : str.mid(pos, len);
  548. }
  549. };
  550. #endif // DIFF_MATCH_PATCH_H