
    i<h                     f   d Z ddlZddlmZmZmZmZ ddlmZ ddddddd	d
dZ	de
de
fdZ	 d:de
de
de
dedee
eee
         ee
         f         f
dZde
deeeef                  de
de
dee
         f
dZde
deeeef                  de
de
fdZde
de
deeeef                  fdZde
de
deeeef                  fdZde
de
deeeef                  fdZde
de
deeeef                  fdZde
de
deeeef                  fdZde
de
deeeef                  fdZde
dee         fd Zd!ee         d"eeeef                  deeeef                  fd#Zde
de
deeeef                  fd$Zde
de
deeeef                  fd%Zde
de
deeeef                  fd&Zd'ee
         d(ed)ed*edeeef         f
d+Zde
d'ee
         d,ee
         de
d-e
deeeef                  fd.Zde
d/e
d0eeeef                  deeeef                  fd1Zd;de
de
d4ed5ede
f
d6Zd7ee
         d8ede
de
de
f
d9Z dS )<aP  
Fuzzy Matching Module for File Operations

Implements a multi-strategy matching chain to robustly find and replace text,
accommodating variations in whitespace, indentation, and escaping common
in LLM-generated code.

The 8-strategy chain (inspired by OpenCode), tried in order:
1. Exact match - Direct string comparison
2. Line-trimmed - Strip leading/trailing whitespace per line
3. Whitespace normalized - Collapse multiple spaces/tabs to single space
4. Indentation flexible - Ignore indentation differences entirely
5. Escape normalized - Convert \n literals to actual newlines
6. Trimmed boundary - Trim first/last line whitespace only
7. Block anchor - Match first+last lines, use similarity for middle
8. Context-aware - 50% line similarity threshold

Multi-occurrence matching is handled via the replace_all flag.

Usage:
    from tools.fuzzy_match import fuzzy_find_and_replace
    
    new_content, match_count, strategy, error = fuzzy_find_and_replace(
        content="def foo():\n    pass",
        old_string="def foo():",
        new_string="def bar():",
        replace_all=False
    )
    N)TupleOptionalListCallable)SequenceMatcher"'z---z... )u   “u   ”u   ‘u   ’u   —u   –u   …    textreturnc                 p    t                                           D ]\  }}|                     ||          } | S )zBNormalizes Unicode characters to their standard ASCII equivalents.)UNICODE_MAPitemsreplace)r   charrepls      6/home/ubuntu/.hermes/hermes-agent/tools/fuzzy_match.py_unicode_normalizer   +   s;    !'')) ( (
d||D$''K    Fcontent
old_string
new_stringreplace_allc           
         |s| dddfS ||k    r| dddfS dt           fdt          fdt          fdt          fd	t          fd
t
          fdt          fdt          fdt          fg	}|D ]\  }} || |          }|rvt          |          dk    r|s| dddt          |           dfc S |dk    rt          | |||          }|r| dd|fc S t          | ||          }	|	t          |          |dfc S | dddfS )a)  
    Find and replace text using a chain of increasingly fuzzy matching strategies.

    Args:
        content: The file content to search in
        old_string: The text to find
        new_string: The replacement text
        replace_all: If True, replace all occurrences; if False, require uniqueness

    Returns:
        Tuple of (new_content, match_count, strategy_name, error_message)
        - If successful: (modified_content, number_of_replacements, strategy_used, None)
        - If failed: (original_content, 0, None, error_description)
    r   Nzold_string cannot be emptyz'old_string and new_string are identicalexactline_trimmedwhitespace_normalizedindentation_flexibleescape_normalizedtrimmed_boundaryunicode_normalizedblock_anchorcontext_aware   zFound zY matches for old_string. Provide more context to make it unique, or use replace_all=True.z1Could not find a match for old_string in the file)_strategy_exact_strategy_line_trimmed_strategy_whitespace_normalized_strategy_indentation_flexible_strategy_escape_normalized_strategy_trimmed_boundary_strategy_unicode_normalized_strategy_block_anchor_strategy_context_awarelen_detect_escape_drift_apply_replacements)
r   r   r   r   
strategiesstrategy_namestrategy_fnmatches	drift_errnew_contents
             r   fuzzy_find_and_replacer9   2   s      >4!===Z4!JJJ 
/"	/0	 "AB	!?@	9:	78	;<	/0	12
.J '1 B B"{+gz22 	B7||a4XS\\ X X X    ''0':zZZ	 7"AtY6666 .gw
KKKGmTAAAA3	B8 AtPPPr   r6   c                      d|vrd|vrdS d                      fd|D                       }dD ]!}||v r||v r||vr|d         }d|d	|d
c S "dS )u'  Detect tool-call escape-drift artifacts in new_string.

    Looks for ``\'`` or ``\"`` sequences that are present in both
    old_string and new_string (i.e. the model copy-pasted them as "context"
    it intended to preserve) but don't exist in the matched region of the
    file. That pattern indicates the transport layer inserted spurious
    shell-style escapes around apostrophes or quotes — writing new_string
    verbatim would literally insert ``\'`` into source code.

    Returns an error string if drift is detected, None otherwise.
    \'\"N c              3   2   K   | ]\  }}||         V  d S N ).0startendr   s      r   	<genexpr>z'_detect_escape_drift.<locals>.<genexpr>   s0      KKZUCgeCi0KKKKKKr   )r;   r<   r&   zNEscape-drift detected: old_string and new_string contain the literal sequence a   but the matched region of the file does not. This is almost always a tool-call serialization artifact where an apostrophe or quote got prefixed with a spurious backslash. Re-read the file with read_file and pass old_string/new_string without backslash-escaping z characters.)join)r   r6   r   r   matched_regionssuspectplains   `      r   r1   r1   w   s      J5
#:#:t ggKKKK7KKKKKO!  j  W
%:%:wo?]?]AJE<(/< < ',< < <   4r   c                 p    t          |d d          }| }|D ]\  }}|d|         |z   ||d         z   }|S )a  
    Apply replacements at the given positions.
    
    Args:
        content: Original content
        matches: List of (start, end) positions to replace
        new_string: Replacement text
    
    Returns:
        Content with replacements applied
    c                     | d         S Nr   r@   xs    r   <lambda>z%_apply_replacements.<locals>.<lambda>   s
    1Q4 r   T)keyreverseN)sorted)r   r6   r   sorted_matchesresultrB   rC   s          r   r2   r2      sX     GFFFNF$ < <
s*,vcdd|;Mr   patternc                     g }d}	 |                      ||          }|dk    rn-|                    ||t          |          z   f           |dz   }J|S )zStrategy 1: Exact string match.r   Tr&   )findappendr0   )r   rT   r6   rB   poss        r   r'   r'      sg    GEll7E**"99S3w<</0111a Nr   c                     d |                     d          D             }d                    |          }|                      d          }d |D             }t          | ||||          S )z
    Strategy 2: Match with line-by-line whitespace trimming.
    
    Strips leading/trailing whitespace from each line before matching.
    c                 6    g | ]}|                                 S r@   striprA   lines     r   
<listcomp>z*_strategy_line_trimmed.<locals>.<listcomp>   s     BBBdTZZ\\BBBr   
c                 6    g | ]}|                                 S r@   r\   r^   s     r   r`   z*_strategy_line_trimmed.<locals>.<listcomp>   s     GGG

GGGr   )splitrE   _find_normalized_matches)r   rT   pattern_linespattern_normalizedcontent_linescontent_normalized_liness         r   r(   r(      s}     CBgmmD.A.ABBBM=11MM$''MGGGGG $ 8#  r   c                 ~    d } ||          } ||           }t          ||          }|sg S t          | ||          S )zC
    Strategy 3: Collapse multiple whitespace to single space.
    c                 .    t          j        dd|           S )Nz[ \t]+r   )resubss    r   	normalizez2_strategy_whitespace_normalized.<locals>.normalize   s    via(((r   )r'   _map_normalized_positions)r   rT   ro   rf   content_normalizedmatches_in_normalizeds         r   r)   r)      sj    ) ) ) #7++"7++ ,,>@RSS  	 %W.@BWXXXr   c           	          |                      d          }d |D             }d |                     d          D             }t          | |||d                    |                    S )z
    Strategy 4: Ignore indentation differences entirely.
    
    Strips all leading whitespace from lines before matching.
    ra   c                 6    g | ]}|                                 S r@   lstripr^   s     r   r`   z2_strategy_indentation_flexible.<locals>.<listcomp>   s     FFFdkkmmFFFr   c                 6    g | ]}|                                 S r@   ru   r^   s     r   r`   z2_strategy_indentation_flexible.<locals>.<listcomp>   s     CCCtT[[]]CCCr   )rc   rd   rE   )r   rT   rg   content_stripped_linesre   s        r   r*   r*      ss     MM$''MFFFFFCCw}}T/B/BCCCM# 6=))  r   c                 N    d } ||          }||k    rg S t          | |          S )zt
    Strategy 5: Convert escape sequences to actual characters.
    
    Handles \n -> newline, \t -> tab, etc.
    c                 ~    |                      dd                               dd                               dd          S )Nz\nra   z\t	z\r)r   rm   s    r   unescapez-_strategy_escape_normalized.<locals>.unescape  s6    yy%%--eT::BB5$OOOr   )r'   )r   rT   r}   pattern_unescapeds       r   r+   r+      sG    P P P !))G##	7$5666r   c           	         |                     d          }|sg S |d                                         |d<   t          |          dk    r|d                                         |d<   d                    |          }|                      d          }g }t          |          }t	          t          |          |z
  dz             D ]}||||z            }|                                }	|	d                                         |	d<   t          |	          dk    r|	d                                         |	d<   d                    |	          |k    r<t          ||||z   t          |                     \  }
}|                    |
|f           |S )z
    Strategy 6: Trim whitespace from first and last lines only.
    
    Useful when the pattern boundaries have whitespace differences.
    ra   r   r&   rV   )rc   r]   r0   rE   rangecopy_calculate_line_positionsrX   )r   rT   re   modified_patternrg   r6   pattern_line_countiblock_linescheck_lines	start_posend_poss               r   r,   r,     s    MM$''M 	 %Q'--//M!
=A)"-3355byy//MM$''M G]++3}%%(::Q>?? 1 1#Aa*<&<$<= "&&(($Q--//A{a)"o3355KO99[!!%555!:q!&8"8#g,," "Iw NNIw/000Nr   originalc                     g }d}| D ]G}|                     |           t                              |          }||t          |          ndz  }H|                     |           |S )u  Build a list mapping each original character index to its normalized index.

    Because UNICODE_MAP replacements may expand characters (e.g. em-dash → '--',
    ellipsis → '...'), the normalised string can be longer than the original.
    This map lets us convert positions in the normalised string back to the
    corresponding positions in the original string.

    Returns a list of length ``len(original) + 1``; entry ``i`` is the
    normalised index that character ``i`` maps to.
    r   Nr&   )rX   r   getr0   )r   rS   norm_posr   r   s        r   _build_orig_to_norm_mapr   =  sr     FH 9 9ht$$!1CIIIq8
MM(Mr   orig_to_normnorm_matchesc                 ,   i }t          | dd                   D ]\  }}||vr|||<   g }t          |           dz
  }|D ]T\  }}||vr
||         }	|	}
|
|k     r#| |
         |k     r|
dz  }
|
|k     r| |
         |k     |                    |	|
f           U|S )zNConvert (start, end) positions in the normalised string to original positions.NrV   r&   )	enumerater0   rX   )r   r   norm_to_orig_startorig_posr   resultsorig_len
norm_startnorm_end
orig_startorig_ends              r   _map_positions_norm_to_origr   R  s     *,'SbS(9:: 4 4(---+3x(%'G<  1$H , 
/ 
/
H///'
3
 !!l8&<x&G&GMH !!l8&<x&G&G 	
H-....Nr   c                     t          |          }t          |           }|| k    r||k    rg S t          ||          }|st          ||          }|sg S t          |           }t	          ||          S )u  Strategy 7: Unicode normalisation.

    Normalises smart quotes, em/en-dashes, ellipsis, and non-breaking spaces
    to their ASCII equivalents in both *content* and *pattern*, then runs
    exact and line_trimmed matching on the normalised copies.

    Positions are mapped back to the *original* string via
    ``_build_orig_to_norm_map`` — necessary because some UNICODE_MAP
    replacements expand a single character into multiple ASCII characters,
    making a naïve position copy incorrect.
    )r   r'   r(   r   r   )r   rT   norm_patternnorm_contentr   r   s         r   r-   r-   o  s     &g..L%g..Lw<7#:#:	"<>>L J-lLII 	*733L&|\BBBr   c           	         t          |          }t          |           }|                    d          }t          |          dk     rg S |d                                         }|d                                         }|                    d          }|                     d          }t          |          }	g }
t	          t          |          |	z
  dz             D ]Y}||                                         |k    r9|||	z   dz
                                           |k    r|
                    |           Zg }t          |
          }|dk    rdnd}|
D ]}|	dk    rd}nfd                    ||dz   ||	z   dz
                     }d                    |dd                   }t          d	||                                          }||k    r<t          ||||	z   t          |                     \  }}|                    ||f           |S )
z
    Strategy 8: Match by anchoring on first and last lines.
    Adjusted with permissive thresholds and unicode normalization.
    ra      r   rV   r&         ?gffffff?g      ?N)
r   rc   r0   r]   r   rX   rE   r   ratior   )r   rT   r   r   re   
first_line	last_linenorm_content_linesorig_content_linesr   potential_matchesr   r6   candidate_count	threshold
similaritycontent_middlepattern_middler   r   s                       r   r.   r.     s5    &g..L%g..L &&t,,M
=A	q!''))Jb!''))I &++D11 t,,]++3)**-??!CDD ( (q!''))Z77q#559:@@BBiOO$$Q'''G+,,O
 (1,,$I 1 1""JJ "YY'9!A#a@R>RST>T:T'UVVN!YY}QrT':;;N(~~NNTTVVJ""!:"Aq+='=s7||" "Iw NNIw/000Nr   c           	      `   |                     d          }|                      d          }|sg S g }t          |          }t          t          |          |z
  dz             D ]}||||z            }d}t          ||          D ]W\  }	}
t	          d|	                                |
                                                                          }|dk    r|dz  }X|t          |          dz  k    r<t          ||||z   t          |                     \  }}|                    ||f           |S )z
    Strategy 9: Line-by-line similarity with 50% threshold.
    
    Finds blocks where at least 50% of lines have high similarity.
    ra   r&   r   Ng?r   )	rc   r0   r   zipr   r]   r   r   rX   )r   rT   re   rg   r6   r   r   r   high_similarity_countp_linec_linesimr   r   s                 r   r/   r/     sN    MM$''MMM$''M 	G]++3}%%(::Q>?? 1 1#Aa*<&<$<= !"!-== 	+ 	+NFF!$GGMMOOCd{{%*% !C$6$6$<<<!:q!&8"8#g,," "Iw NNIw/000Nr   rg   
start_lineend_linecontent_lengthc                     t          d | d|         D                       }t          d | d|         D                       dz
  }||k    r|}||fS )a  Calculate start and end character positions from line indices.

    Args:
        content_lines: List of lines (without newlines)
        start_line: Starting line index (0-based)
        end_line: Ending line index (exclusive, 0-based)
        content_length: Total length of the original content string

    Returns:
        Tuple of (start_pos, end_pos) in the original content
    c              3   :   K   | ]}t          |          d z   V  dS r&   Nr0   r^   s     r   rD   z,_calculate_line_positions.<locals>.<genexpr>  s,      IIdCIIMIIIIIIr   Nc              3   :   K   | ]}t          |          d z   V  dS r   r   r^   s     r   rD   z,_calculate_line_positions.<locals>.<genexpr>  s,      EED#d))a-EEEEEEr   r&   )sum)rg   r   r   r   r   r   s         r   r   r     sn     IImKZK.HIIIIIIEEM)8),DEEEEEIG.   gr   rh   rf   c           	      `   |                     d          }t          |          }g }t          t          |          |z
  dz             D ]d}d                    ||||z                      }	|	|k    r<t	          ||||z   t          |                     \  }
}|                    |
|f           e|S )a  
    Find matches in normalized content and map back to original positions.
    
    Args:
        content: Original content string
        content_lines: Original content split by lines
        content_normalized_lines: Normalized content lines
        pattern: Original pattern
        pattern_normalized: Normalized pattern
    
    Returns:
        List of (start, end) positions in the original content
    ra   r&   )rc   r0   r   rE   r   rX   )r   rg   rh   rT   rf   pattern_norm_linesnum_pattern_linesr6   r   blockr   r   s               r   rd   rd     s      ,11$77.//G3/003DDqHII 	1 	1		21Q9J5J3JKLL&&&!:q!&7"7W" "Iw NNIw/000Nr   
normalizednormalized_matchesc           
         |sg S g }d}d}|t          |           k     r|t          |          k     r| |         ||         k    r |                    |           |dz  }|dz  }n| |         dv rI||         dk    r=|                    |           |dz  }|t          |           k     r| |         dvr|dz  }n?| |         dv r|                    |           |dz  }n|                    |           |dz  }|t          |           k     r|t          |          k     |t          |           k     r:|                    t          |                     |dz  }|t          |           k     :i }i }t          |          D ]\  }}	|	|vr|||	<   |||	<   g }
|D ]\  }|v r	|         }n(t          fdt          |          D                       }|dz
  |v r||dz
           dz   }n||z
  z   }|t          |           k     r,| |         dv r"|dz  }|t          |           k     r
| |         dv "|
                    |t          |t          |                     f           |
S )z
    Map positions from normalized string back to original.
    
    This is a best-effort mapping that works for whitespace normalization.
    r   r&   z 	r   c              3   .   K   | ]\  }}|k    |V  d S r?   r@   )rA   r   nr   s      r   rD   z,_map_normalized_positions.<locals>.<genexpr>_  s+      VV41aa:ooQooooVVr   )r0   rX   r   min)r   r   r   r   orig_idxnorm_idxr   norm_to_orig_endr   r   original_matchesr   r   r   r   s                 @r   rp   rp   $  s	     	 LHH
S]]
"
"x#j//'A'AHH!555)))MHMHHh5((Z-AS-H-H)))MH#h--''HX,>e,K,KAh5(()))MHH )))MH' S]]
"
"x#j//'A'A, S]]
"
"C
OO,,,A S]]
"
"
 '55 . .(---+3x(%-""  2 L L
H++++J7JJ VVVV9\+B+BVVVVVJ a<+++'159HH!X
%:;H X&&8H+=+F+FMH X&&8H+=+F+F 	S3x==-I-I JKKKKr   r      context_linesmax_resultsc                    | r|sdS |                                  }|                                 |rsdS |d                                         }|sd |D             }|sdS |d         }g }t                    D ]\\  }}	|	                                }
|
st          d||
                                          }|dk    r|                    ||f           ]|sdS |                    d            |d|         }g }t                      }|D ]\  }}t          d||z
            t          t                    |t          |          z   |z             }|f}||v rQ|                    |           d                    fd	t          |z
            D                       }|                    |           |sdS d
                    |          S )zFind lines in content most similar to old_string for "did you mean?" feedback.

    Returns a formatted string showing the closest matching lines with context,
    or empty string if no useful match is found.
    r=   r   c                 ^    g | ]*}|                                 |                                 +S r@   r\   )rA   ls     r   r`   z&find_closest_lines.<locals>.<listcomp>  s-    @@@Aaggii@aggii@@@r   Ng333333?c                     | d          S rK   r@   rL   s    r   rN   z$find_closest_lines.<locals>.<lambda>  s    qte r   )rO   ra   c              3   F   K   | ]}|z   d z   dd|z             V  dS )r&   4dz| Nr@   )rA   jrg   rB   s     r   rD   z%find_closest_lines.<locals>.<genexpr>  sY       
 
 qy1}====#;==
 
 
 
 
 
r   z
---
)
splitlinesr]   r   r   r   rX   sortsetmaxr   r0   addrE   r   )r   r   r   r   	old_linesanchor
candidatesscoredr   r_   strippedr   toppartsseen_ranges_line_idxrC   rO   snippetrg   rB   s                       @@r   find_closest_linesr   p  sG     W r%%''I&&((M M r q\!!F @@@@@
 	2A F]++ & &4::<< 	fh77==??3;;MM5!*%%% r KKOOK$$$
+
CE%%K  8Ax-/00#m$$hY&?-&OPPcl+)) 
 
 
 
 
3;''
 
 
 
 
 	W r>>%   r   errormatch_countc                 v    |dk    rdS | r|                      d          sdS t          ||          }|sdS d|z   S )u  Return a '\n\nDid you mean...' snippet for plain no-match errors.

    Gated so the hint only fires for actual "old_string not found" failures.
    Ambiguous-match ("Found N matches"), escape-drift, and identical-strings
    errors all have ``match_count == 0`` but a "did you mean?" snippet would
    be misleading — those failed for unrelated reasons.

    Returns an empty string when there's nothing useful to append.
    r   r=   zCould not findz&

Did you mean one of these sections?
)
startswithr   )r   r   r   r   hints        r   format_no_match_hintr     s^     ar (()9:: rj'22D r6==r   )F)r   r   )!__doc__rk   typingr   r   r   r   difflibr   r   strr   boolintr9   r1   r2   r'   r(   r)   r*   r+   r,   r   r   r-   r.   r/   r   rd   rp   r   r   r@   r   r   <module>r      s   < 
			 2 2 2 2 2 2 2 2 2 2 2 2 # # # # # # SScs	 S S     16BQ BQC BQS BQc BQ)-BQ:?S(SV-YabeYf@f:gBQ BQ BQ BQJ%# %U38_0E %%(%69%>Fsm% % % %P tE#s(O/D RU Z]    6
S 
3 
4c3h3H 
 
 
 
C # $uS#X:O    (YS Y3 Y4cSVhCX Y Y Y Y*C # $uSRUXBW     7 7s 7tE#s(O?T 7 7 7 7&' 'c 'd5c?>S ' ' ' 'Tc d3i    *s)uS#X' 
%S/   :C# C CU3PS8_@U C C C C>5C 5# 5$uS#X:O 5 5 5 5p S  3  4c3h;P        NT#Y C (+=@EJ3PS8_   ( c  $s)  8<S	 '* @C HLUSVX[S[_H]       FI I I37c3h3HIMQRWX[]`X`RaMbI I I IX;! ;!3 ;! ;!S ;![^ ;!gj ;! ;! ;! ;!|> >C >%(>36>;>> > > > > >r   