
    i\                    x   U d Z ddlZddlZddlZddlZddlZddlmZmZm	Z	m
Z
mZ ddlZerddlmZ dae
e         ed<   defdZ G d d	          Z e            Zdd
lmZmZmZ ddlmZ ddlmZmZmZ ddlm Z m!Z! ddl"m#Z# ddl$m%Z%  ej&        e'          Z(de)de*fdZ+de,fdZ-de)fdZ.de)de*fdZ/da0da1de
e2ee)e)f         e2e)e
e)         e
e)         f         f                  fdZ3de)fdZ4de*fdZ5de*fdZ6ddZ7de)fdZ8de9e)         fdZ:d Z;da<da=d Z>d Z? ej@        d d!          ZAd"e)d#e,de,fd$ZBd%e,de,fd&ZCdd%e,d(e)deee)e	f                  fd)ZDd*e	de	fd+ZEd,e	deee)e	f                  fd-ZFd%e	deee)e	f                  fd.ZGd/e	dee)e	f         fd0ZHd1ZId2e	de*fd3ZJdd4e
e)         de2e
e	         e
e)         ee)e	f         f         fd5ZKde
e)         fd6ZL ed7d89          ZMd'd'deIfd:e)d;e)d<e)d4e
e)         d=eNde
e)         fd>ZO	 	 	 dd:e)dAe)d4e
e)         dBeNdCe*dDe)de
e)         fdEZPd:e)dAe)d4e
e)         dFeNdGeNde
e)         fdHZQdIe)de)fdJZRdaSdK ZTddMe)dNeNde,fdOZUdPee)         deee)e	f                  fdQZVddMe)dNeNde,fdSZWdPee)         deee)e	f                  fdTZXddMe)dNeNde)fdUZYddVdeIfdPee)         dWe)dXe*d4e
e)         d=eNde)fdYZZddZdVdeIfd;e)d[e)d\e)dXe*d4e
e)         d=eNde)fd]Z[de*fd^Z\de*fd_Z]de*fd`Z^e'dak    r	  e_db            e_dc            e]            Z` e5            Za e* ej@        ddd'          b                                          Zc e* ej@        ded'          b                                          Zd e^            Ze eL            Zfe`r e.            Zg e_dfeg            egdgk    r e_dh           negdik    r e_dj           negdkk    r e_dl           nedrB e_dm ej@        de          b                                h                    dn                      nTecr e_do           nFear e_dp e4                        n- e_dq           n! e_dr            e_ds e8                        ees" e_dt            e_du            e_dv           n e_dwef            e`s eidx            e_dy           eer e_dzef             e_d{eI d|           eMjj        r0 e_d}eMjk                     e_d~eMjl         deMjk         d           n e_d            e_d            e_d            e_d            e_d'            e_d            e_d            e_d'            e_d            e_d            e_d            e_d            e_d           eer e_d            e_d            e_d            e_d'            e_d            e_d            e_d            e_d            e_d            e_d            e_d            e_d'            e_d            e_d            e_d            e_d            e_d            e_d            e_d            e_d            e_d            e_d            e_d            e_d           ddlmmnZnmoZo dddddddddxddRdddMgddZpddddPdddiddRdidPgddZq enjr        ddepd e] e:            dd            enjr        ddeqd e] e:            dVdd	  	         dS )a%  
Standalone Web Tools Module

This module provides generic web tools that work with multiple backend providers.
Backend is selected during ``hermes tools`` setup (web.backend in config.yaml).
When available, Hermes can route Firecrawl calls through a Nous-hosted tool-gateway
for Nous Subscribers only.

Available tools:
- web_search_tool: Search the web for information
- web_extract_tool: Extract content from specific web pages
- web_crawl_tool: Crawl websites with specific instructions

Backend compatibility:
- Exa: https://exa.ai (search, extract)
- Firecrawl: https://docs.firecrawl.dev/introduction (search, extract, crawl; direct or derived firecrawl-gateway.<domain> for Nous Subscribers)
- Parallel: https://docs.parallel.ai (search, extract)
- Tavily: https://tavily.com (search, extract, crawl)

LLM Processing:
- Uses OpenRouter API with Gemini 3 Flash Preview for intelligent content extraction
- Extracts key excerpts and creates markdown summaries to reduce token usage

Debug Mode:
- Set WEB_TOOLS_DEBUG=true to enable detailed logging
- Creates web_tools_debug_UUID.json in ./logs directory
- Captures all tool calls, results, and compression metrics

Usage:
    from web_tools import web_search_tool, web_extract_tool, web_crawl_tool
    
    # Search the web
    results = web_search_tool("Python machine learning libraries", limit=3)
    
    # Extract content from URLs  
    content = web_extract_tool(["https://example.com"], format="markdown")
    
    # Crawl a website
    crawl_data = web_crawl_tool("example.com", "Find contact information")
    N)ListDictAnyOptionalTYPE_CHECKING	Firecrawl_FIRECRAWL_CLS_CACHEreturnc                  .    t           ddlm}  | a t           S )z)Import and cache ``firecrawl.Firecrawl``.Nr   r   )r
   	firecrawlr	   )_clss    4/home/ubuntu/.hermes/hermes-agent/tools/web_tools.py_load_firecrawl_clsr   ?   s(     #//////#    c                   (    e Zd ZdZdZd Zd Zd ZdS )_FirecrawlProxyzNModule-level proxy that looks like ``firecrawl.Firecrawl`` but imports lazily. c                 *     t                      |i |S N)r   )selfargskwargss      r   __call__z_FirecrawlProxy.__call__M   s    $"$$d5f555r   c                 :    t          |t                                S r   )
isinstancer   )r   objs     r   __instancecheck__z!_FirecrawlProxy.__instancecheck__P   s    #244555r   c                     dS )Nz <lazy firecrawl.Firecrawl proxy>r   )r   s    r   __repr__z_FirecrawlProxy.__repr__S   s    11r   N)__name__
__module____qualname____doc__	__slots__r   r   r    r   r   r   r   r   H   sL        XXI6 6 66 6 62 2 2 2 2r   r   )async_call_llmextract_content_or_reasoningget_async_text_auxiliary_client)DebugSession)build_vendor_gateway_urlread_nous_access_tokenresolve_managed_tool_gateway)managed_nous_tools_enabledprefers_gateway)is_safe_url)check_website_accessnamec                 p    t          j        |           }t          |o|                                          S r   )osgetenvboolstrip)r1   vals     r   _has_envr8   m   s*    
)D//C#		$$$r   c                  ~    	 ddl m}   |                                 di           S # t          t          f$ r i cY S w xY w)z5Load the ``web:`` section from ~/.hermes/config.yaml.r   load_configweb)hermes_cli.configr;   getImportError	Exceptionr:   s    r   _load_web_configrA   q   s^    111111{}}  +++#   			s   #& <<c                  z   t                                          d          pd                                                                } | dv r| S dt	          d          pt	          d          pt                      fdt	          d          fd	t	          d
          fdt	          d          ff}|D ]\  }}|r|c S dS )zDetermine which web backend to use.

    Reads ``web.backend`` from config.yaml (set by ``hermes tools``).
    Falls back to whichever API key is present for users who configured
    keys manually without running setup.
    backend )parallelr   tavilyexar   FIRECRAWL_API_KEYFIRECRAWL_API_URLrE   PARALLEL_API_KEYrF   TAVILY_API_KEYrG   EXA_API_KEY)rA   r>   lowerr6   r8   _is_tool_gateway_ready)
configuredbackend_candidatesrC   	availables       r   _get_backendrR   y   s     #$$((339r@@BBHHJJJ??? 
h233px@S7T7TpXnXpXpq	X0112	8,--.	''(	 1   	NNN	 ;r   rC   c                     | dk    rt          d          S | dk    rt          d          S | dk    rt                      S | dk    rt          d          S dS )	z:Return True when the selected backend is currently usable.rG   rL   rE   rJ   r   rF   rK   F)r8   check_firecrawl_api_key)rC   s    r   _is_backend_availablerU      sk    %&&&**++++&((((()))5r   c                     t          j        dd                                          } t          j        dd                                                              d          }| s|sdS i }| r| |d<   |r||d<   |d|pd| pdffS )	zHReturn explicit direct Firecrawl kwargs + cache key, or None when unset.rH   rD   rI   /Napi_keyapi_urldirect)r3   r4   r6   rstrip)rX   rY   r   s      r   _get_direct_firecrawl_configr\      s    i+R006688Gi+R006688??DDG 7 tF $#y $#yHgow$???r   c                       t          d          S )z(Return configured Firecrawl gateway URL.r   )r*   r   r   r   _get_firecrawl_gateway_urlr^      s    #K000r   c                  2    t          dt                    duS )zGReturn True when gateway URL and a Nous Subscriber token are available.r   token_readerN)r,   _read_nous_access_tokenr   r   r   rN   rN      s    'BYZZZbfffr   c                  "    t                      duS )zBReturn True when direct Firecrawl config is explicitly configured.N)r\   r   r   r   _has_direct_firecrawl_configrd      s    '))55r   c                  J    d} t                      r| dz  } t          |           )z>Raise a clear error for unsupported web backend configuration.zWeb tools are not configured. Set FIRECRAWL_API_KEY for cloud Firecrawl or set FIRECRAWL_API_URL for a self-hosted Firecrawl instance.u    With your Nous subscription you can also use the Tool Gateway — run `hermes tools` and select Nous Subscription as the web provider.)r-   
ValueError)messages    r   &_raise_web_backend_configuration_errorrh      s=    	s  "## 
S	
 W

r   c                  (    t                      sdS 	 dS )zAReturn optional managed-gateway guidance for Firecrawl help text.rD   zc, or use the Nous Tool Gateway via your subscription (FIRECRAWL_GATEWAY_URL or TOOL_GATEWAY_DOMAIN))r-   r   r   r   _firecrawl_backend_help_suffixrj      s$    %'' r	9 r   c                  X    g d} t                      r|                     g d           | S )zEReturn tool metadata env vars for the currently enabled web backends.)rL   rJ   rK   rH   rI   )FIRECRAWL_GATEWAY_URLTOOL_GATEWAY_DOMAINTOOL_GATEWAY_SCHEMETOOL_GATEWAY_USER_TOKEN)r-   extend)requiress    r   _web_requires_envrr      sO      H "## 
  	
 	
 	
 Or   c                  f   t                      } | t          d          s| \  }}n_t          dt                    }|(t                              d           t                       |j        |j        d}d|d         |j        f}t          t          |k    rt          S t          d	i |a	|a
t          S )
zGet or create Firecrawl client.

    When ``web.use_gateway`` is set in config, the Tool Gateway is preferred
    even if direct Firecrawl credentials are present.  Otherwise direct
    Firecrawl takes precedence when explicitly configured.
    Nr<   r   r`   zTFirecrawl client initialization failed: missing direct config and tool-gateway auth.)rX   rY   ztool-gatewayrY   r   )r\   r.   r,   rb   loggererrorrh   nous_user_tokengateway_origin_firecrawl_client_firecrawl_client_configr	   )direct_configr   client_configmanaged_gateways       r   _get_firecrawl_clientr}      s     122M )?)?  -60
 
 
 "LLoppp2444 '6&5
 

 9+
 $)A])R)R   "++F++,r   c                      ddl m}  t          1t          j        d          }|st          d           | |          at          S )zwGet or create the Parallel sync client (lazy initialization).

    Requires PARALLEL_API_KEY environment variable.
    r   )ParallelNrJ   VPARALLEL_API_KEY environment variable not set. Get your API key at https://parallel.airX   )rE   r   _parallel_clientr3   r4   rf   )r   rX   s     r   _get_parallel_clientr     sf    
 "!!!!!).// 	:   $8G444r   c                      ddl m}  t          1t          j        d          }|st          d           | |          at          S )zxGet or create the Parallel async client (lazy initialization).

    Requires PARALLEL_API_KEY environment variable.
    r   )AsyncParallelNrJ   r   r   )rE   r   _async_parallel_clientr3   r4   rf   )r   rX   s     r   _get_async_parallel_clientr   1  sf    
 '&&&&&%).// 	:   "/w!?!?!?!!r   TAVILY_BASE_URLzhttps://api.tavily.comendpointpayloadc                 J   t          j        d          }|st          d          ||d<   t           d|                     d           }t
                              d| |           t          j        ||d          }|	                                 |
                                S )zSend a POST request to the Tavily API.

    Auth is provided via ``api_key`` in the JSON body (no header-based auth).
    Raises ``ValueError`` if ``TAVILY_API_KEY`` is not set.
    rK   z\TAVILY_API_KEY environment variable not set. Get your API key at https://app.tavily.com/homerX   rW   zTavily %s request to %s<   )jsontimeout)r3   r4   rf   _TAVILY_BASE_URLlstriprt   infohttpxpostraise_for_statusr   )r   r   rX   urlresponses        r   _tavily_requestr   G  s     i())G 
>
 
 	
 !GI
6
6 4 4
6
6C
KK)8S999z#GR888H==??r   r   c           	         g }t          |                     dg                     D ]^\  }}|                    |                    dd          |                    dd          |                    dd          |dz   d           _dd	|id
S )zNormalize Tavily /search response to the standard web search format.

    Tavily returns ``{results: [{title, url, content, score, ...}]}``.
    We map to ``{success, data: {web: [{title, url, description, position}]}}``.
    resultstitlerD   r   content   )r   r   descriptionpositionTr<   successdata)	enumerater>   append)r   web_resultsiresults       r    _normalize_tavily_search_resultsr   [  s     Kx||Ir::;;  	6ZZ,,::eR((!::i44A	
 
 	 	 	 	 e[%9:::r   rD   fallback_urlc                    g }|                      dg           D ]}|                     d|          }|                     dd          p|                     dd          }|                    ||                     dd          ||||                     dd          dd           |                      d	g           D ]\}|                    |                     d|          ddd|                     d
d          d|                     d|          id           ]|                      dg           D ]F}t          |t                    r|nt          |          }|                    |ddddd|id           G|S )zNormalize Tavily /extract or /crawl response to the standard document format.

    Maps results to ``{url, title, content, raw_content, metadata}`` and
    includes any ``failed_results`` / ``failed_urls`` as error entries.
    r   r   raw_contentrD   r   r   	sourceURLr   r   r   r   r   metadatafailed_resultsru   extraction failedr   )r   r   r   r   ru   r   failed_urls)r>   r   r   str)	r   r   	documentsr   r   rawfailfail_urlurl_strs	            r   _normalize_tavily_documentsr   l  s    ')I,,y"-- 	 	jj--jj++Hvzz)R/H/HZZ,,&)FJJw4K4KLL
 
 	 	 	 	 -r22  88E<00XXg':;;$dhhul&C&CD
 
 	 	 	 	 LL33 	 	(377J((S]]($g.
 
 	 	 	 	 r   valuec           	      l   | dS t          | t          t          t          t          t
          t          f          r| S t          | d          r%	 |                                 S # t          $ r Y nw xY wt          | d          r4	 d | j
                                        D             S # t          $ r Y nw xY w| S )zBConvert SDK objects to plain python data structures when possible.N
model_dump__dict__c                 D    i | ]\  }}|                     d           ||S _
startswith).0kvs      r   
<dictcomp>z$_to_plain_object.<locals>.<dictcomp>  s0    UUUTQ1<<PSCTCTUAqUUUr   )r   dictlistr   intfloatr5   hasattrr   r@   r   items)r   s    r   _to_plain_objectr     s    }t%$c3t<== ul## 	##%%% 	 	 	D	 uj!! 	UUU^%9%9%;%;UUUU 	 	 	D	 Ls$   A   
A-,A-"B$ $
B10B1valuesc                     t          | t                    sg S g }| D ];}t          |          }t          |t                    r|                    |           <|S )z7Normalize mixed SDK/list payloads into a list of dicts.)r   r   r   r   r   )r   
normalizeditemplains       r   _normalize_result_listr     sg    fd## 	')J % % &&eT"" 	%e$$$r   c                 v   t          |           }t          |t                    r|                    d          }t          |t                    rt          |          S t          |t                    rLt          |                    d                    }|r|S t          |                    d                    }|r|S t          |                    d                    }|r|S t          |                    d                    }|r|S t          | d          rt          t          | dg                     S g S )zKExtract Firecrawl search results across SDK/direct/gateway response shapes.r   r<   r   )r   r   r   r>   r   r   r   getattr)r   response_plainr   data_webdata_resultstop_webtop_resultss          r   _extract_web_search_resultsr     s4   %h//N.$'' !!&))dD!! 	0)$///dD!! 	$-dhhuoo>>H  1$((92E2EFFL $##(););E)B)BCC 	N,^-?-?	-J-JKK 	x D%ghr&B&BCCCIr   scrape_resultc                     t          |           }t          |t                    si S |                    d          }t          |t                    r|S |S )zINormalize Firecrawl scrape payload shape across SDK and gateway variants.r   )r   r   r   r>   )r   result_plainnesteds      r   _extract_scrape_payloadr     sV    #M22LlD)) 	f%%F&$ r     clientc                     ddl m} t          t          | dd          pd          } ||          j        pd                                }|dk    p|                    d          S )z?Return True when the resolved auxiliary backend is Nous Portal.r   )urlparsebase_urlrD   znousresearch.comz.nousresearch.com)urllib.parser   r   r   hostnamerM   endswith)r   r   r   hosts       r   _is_nous_auxiliary_clientr     ss    %%%%%%76:r228b99HHX'-24466D%%K7J)K)KKr   modelc                     t          d          \  }}t          j        dd                                          }| p|p|}i }|$t	          |          rddlm}  |            pddgi}|||fS )	zHResolve the current web-extract auxiliary client, model, and extra body.web_extractAUXILIARY_WEB_EXTRACT_MODELrD   Nr   )get_auxiliary_extra_bodytagszproduct=hermes-agent)r(   r3   r4   r6   r   agent.auxiliary_clientr   )r   r   default_modelconfigured_modeleffective_model
extra_bodyr   s          r   _resolve_web_extract_auxiliaryr     s    ;MJJFMy!>CCIIKK@/@=O!#J7??CCCCCC--//UF=S<T3U
?J..r   c                  *    t                      \  } }} |S )zBReturn the current default model for web extraction summarization.r   )r   r   s     r   _get_default_summarizer_modelr     s    022KAuaLr   	web_toolsWEB_TOOLS_DEBUG)env_varr   r   r   
min_lengthc                   K   d}d}d}d}	 t          |           }	|	|k    r'|	dz  }
t                              d|
           d|
dd	S |	|k     rt                              d
|	|           dS g }|r|                    d|            |r|                    d|            |rd                    |          dz   nd}|	|k    r4t                              d|	           t          | ||||           d{V S t                              d|	           t          | ||           d{V }|r\t          |          |k    r|d|         dz   }t          |          }|	dk    r||	z  nd}t                              d|	||dz             |S # t          $ rt}t                              dt          |          dd                    | d|         }t          |           |k    r|d|ddt          |           ddz  }|cY d}~S d}~ww xY w)a  
    Process web content using LLM to create intelligent summaries with key excerpts.
    
    This function uses Gemini 3 Flash Preview (or specified model) via OpenRouter API 
    to intelligently extract key information and create markdown summaries,
    significantly reducing token usage while preserving all important information.
    
    For very large content (>500k chars), uses chunked processing with synthesis.
    For extremely large content (>2M chars), refuses to process entirely.
    
    Args:
        content (str): The raw content to process
        url (str): The source URL (for context, optional)
        title (str): The page title (for context, optional)
        model (str): The model to use for processing (default: google/gemini-3-flash-preview)
        min_length (int): Minimum content length to trigger processing (default: 5000)
        
    Returns:
        Optional[str]: Processed markdown content, or None if content too short or processing fails
    i i  順 r   i@B z<Content too large (%.1fMB > 2MB limit). Refusing to process.z[Content too large to process: z.1fzdMB. Try using web_crawl with specific extraction instructions, or search for a more focused source.]z:Content too short (%d < %d chars), skipping LLM processingNzTitle: zSource: 


rD   z5Content large (%d chars). Using chunked processing...z+Processing content with LLM (%d characters)4

[... summary truncated for context management ...]r         ?z*Content processed: %d -> %d chars (%.1f%%)d   zweb_extract LLM summarization failed (%s). Tip: increase auxiliary.web_extract.timeout in config.yaml or switch to a faster auxiliary model.x   u'   

[Content truncated — showing first , of z chars. LLM summarization timed out. To fix: increase auxiliary.web_extract.timeout in config.yaml, or use a faster auxiliary model. Use browser_navigate for the full page.])lenrt   warningdebugr   joinr   _process_large_content_chunked_call_summarizer_llmr@   r   )r   r   r   r   r   MAX_CONTENT_SIZECHUNK_THRESHOLD
CHUNK_SIZEMAX_OUTPUT_SIZEcontent_lensize_mbcontext_infocontext_strprocessed_contentprocessed_lengthcompression_ratioe	truncateds                     r   process_content_with_llmr    s     8 !OJO@'ll )))!I-GNNY[bccc gW  g  g  g  g  g ##LLUWbdnooo4  	3 1% 1 1222 	2 03 0 0111:FNdii--66B ((KKOQ\]]]7eZ        
 	A;OOO"6wU"S"SSSSSSS 	~$%%77$56F6F$G  KC  %C!  ##455BMPQ// 0; > >WZKKDkScevy|e|}}}     5 FF4C4L		
 	
 	
 ,_,-	w<</))]OS ] ]w<<H] ] ]I %s2   ;F
 "F
 ,BF
 8BF
 

HA)H=HH N  Fr  
max_tokensis_chunk
chunk_infoc           
        K   |rd}d| | d|  d}nd}d| d|  d}d	}d	}	d
}
t          |          D ]}	 t          |          \  }}}||st                              d            d
S d|d|dd|dgd|d}|r||d<   t	          di | d
{V }t          |          } | r| c S t                              d|dz   |           ||dz
  k     r.t          j        |	           d
{V  t          |	d	z  d          }	| c S # t          $ r t                              d           Y  d
S t          $ r}|}
||dz
  k     r~t                              d|dz   |t          |          d
d                    t                              d|	           t          j        |	           d
{V  t          |	d	z  d          }	n|
Y d
}~d
}~ww xY wd
S )a  
    Make a single LLM call to summarize content.
    
    Args:
        content: The content to summarize
        context_str: Context information (title, URL)
        model: Model to use
        max_tokens: Maximum output tokens
        is_chunk: Whether this is a chunk of a larger document
        chunk_info: Information about chunk position (e.g., "Chunk 2/5")
        
    Returns:
        Summarized content or None on failure
    a  You are an expert content analyst processing a SECTION of a larger document. Your job is to extract and summarize the key information from THIS SECTION ONLY.

Important guidelines for chunk processing:
1. Do NOT write introductions or conclusions - this is a partial document
2. Focus on extracting ALL key facts, figures, data points, and insights from this section
3. Preserve important quotes, code snippets, and specific details verbatim
4. Use bullet points and structured formatting for easy synthesis later
5. Note any references to other sections (e.g., "as mentioned earlier", "see below") without trying to resolve them

Your output will be combined with summaries of other sections, so focus on thorough extraction rather than narrative flow.zAExtract key information from this SECTION of a larger document:

z

SECTION CONTENT:
z

Extract all important information from this section in a structured format. Focus on facts, data, insights, and key details. Do not add introductions or conclusions.a~  You are an expert content analyst. Your job is to process web content and create a comprehensive yet concise summary that preserves all important information while dramatically reducing bulk.

Create a well-structured markdown summary that includes:
1. Key excerpts (quotes, code snippets, important facts) in their original format
2. Comprehensive summary of all other important information
3. Proper markdown formatting with headers, bullets, and emphasis

Your goal is to preserve ALL important information while reducing length. Never lose key facts, figures, insights, or actionable information. Make it scannable and well-organized.zNPlease process this web content and create a comprehensive markdown summary:

zCONTENT TO PROCESS:
z

Create a markdown summary that captures all key information in a well-organized, scannable format. Include important quotes and code snippets in their original formatting. Focus on actionable information, specific details, and unique insights.   Nz7No auxiliary model available for web content processingr   systemroler   user皙?taskr   messagestemperaturer  r   z4LLM returned empty content (attempt %d/%d), retryingr   r   z'LLM API call failed (attempt %d/%d): %sr  zRetrying in %ds...r   )ranger   rt   r  r&   r'   asynciosleepminRuntimeErrorr@   r   )r   r  r   r  r  r  system_promptuser_promptmax_retriesretry_delay
last_errorattempt
aux_clientr   r   call_kwargsr   	api_errors                     r   r  r  i  s     ,  &w	~iii i
 	i i iwww w 	w w w KKJ%% +! +!*	!6TUZ6[6[3J!!XYYYtt%(%-@@#<<  #( K  7,6L)+::k::::::::H28<<G NNQSZ]^S^`klllq((mK000000000!+/266NNN 	 	 	NNTUUU444 	! 	! 	!"Jq((H'TU+Wbdghqdrdrswtwswdxyyy3[AAAmK000000000!+/266   	! 4s1   1D+;D)AD?D$G
+	G
4BGG

chunk_sizemax_output_sizec           	        K   g t          dt          |           |          D ]$}| |||z            }                    |           %t                              dt                    |           dt
          dt          dt          t
          t          t                   f         ffdfdt                    D             }t          j        |  d{V }g }	t          |d	 
          D ]%\  }
}|r|	                    d|
dz    d|            &|	st                              d           dS t                              dt          |	          t                               t          |	          dk    r*|	d         }t          |          |k    r|d|         dz   }|S t                              dt          |	                     d                    |	          }d| d d| d}	 t                    \  }}}||sQt                              d           d                    |	          }t          |          |k    r|d|         dz   }|S d|dddd|dgdd d!}|r||d"<   t#          d+i | d{V }t%          |          }|s;t                              d#           t#          d+i | d{V }t%          |          }|sQt                              d$           d                    |	          }t          |          |k    r|d|         dz   }|S t          |          |k    r|d|         d%z   }t          |           }t          |          }|dk    r||z  nd&}t                              d'|||d(z             |S # t&          $ rq}t                              d)t          |          dd(                    d                    |	          }t          |          |k    r|d|         d*z   }|cY d}~S d}~ww xY w),a  
    Process large content by chunking, summarizing each chunk in parallel,
    then synthesizing the summaries.
    
    Args:
        content: The large content to process
        context_str: Context information
        model: Model to use
        chunk_size: Size of each chunk in characters
        max_output_size: Maximum final output size
        
    Returns:
        Synthesized summary or None on failure
    r   z&Split into %d chunks of ~%d chars each	chunk_idxchunk_contentr   c           	        K   	 d| dz    dt                     d}t          |dd|           d{V }|rHt                              d	| dz   t                    t          |          t          |                     | |fS # t          $ rP}t                              d
| dz   t                    t          |          dd                    | dfcY d}~S d}~ww xY w)zSummarize a single chunk.z[Processing chunk r   r  ]i'  T)r  r  r  Nz&Chunk %d/%d summarized: %d -> %d charszChunk %d/%d failed: %s2   )r  r  rt   r   r@   r  r   )r;  r<  r  summaryr  chunksr  r   s        r   summarize_chunkz7_process_large_content_chunked.<locals>.summarize_chunk  sL     	#Oi!mOOVOOOJ0 %        G  DDiRSmUXY_U`U`befsbtbtvy  {B  wC  wC  D  D  Dg%% 	# 	# 	#NN3Y]CKKQTUVQWQWX[Y[X[Q\]]]d?""""""	#s   BB 
C!ACC!C!c                 .    g | ]\  }} ||          S r   r   )r   r   chunkrB  s      r   
<listcomp>z2_process_large_content_chunked.<locals>.<listcomp>  s)    III81e__Q&&IIIr   Nc                     | d         S )Nr   r   )xs    r   <lambda>z0_process_large_content_chunked.<locals>.<lambda>  s
    AaD r   )keyz## Section r   r   zAll chunk summarizations failedzB[Failed to process large content: all chunk summarizations failed]zGot %d/%d chunk summariesz

[... truncated ...]zSynthesizing %d summaries...z

---

a'  You have been given summaries of different sections of a large document. 
Synthesize these into ONE cohesive, comprehensive summary that:
1. Removes redundancy between sections
2. Preserves all key facts, figures, and actionable information
3. Is well-organized with clear structure
4. Is under z characters

zSECTION SUMMARIES:
z,

Create a single, unified markdown summary.z9No auxiliary model for synthesis, concatenating summariesr   r   r!  zdYou synthesize multiple summaries into one cohesive, comprehensive summary. Be thorough but concise.r"  r$  r%  r  r&  r   z3Synthesis LLM returned empty content, retrying onceu>   Synthesis failed after retry — concatenating chunk summariesr  r  z+Synthesis complete: %d -> %d chars (%.2f%%)r  zSynthesis failed: %sz.

[... truncated due to synthesis failure ...]r   )r*  r  r   rt   r   r   r   tupler   r   r+  gathersortedr	  r
  r   r  r&   r'   r@   )r   r  r   r8  r9  r   rD  tasksr   	summariesr;  r@  r   combined_summariessynthesis_promptr5  r   r   fallbackr6  r   final_summaryoriginal_len	final_lencompressionr  rA  rB  s    ``                       @@r   r  r    s     , F1c'llJ//  !j.()e
KK8#f++zRRR# #S #U3PXY\P]K]E^ # # # # # # # #( JIIIy7H7HIIIENE*******G I$W..AAA G G	7 	GE9q=EEGEEFFF T6777SS
KK+S^^S[[III 9~~1v;;((,_,-0IIF KK.I???&++I66
.
 
. 
. 
. 
. 
. 
. 
.72PQV2W2W/
OZ_NNVWWW{{9--H8}}..#$4_$458QQO "$!  /U  V  V,<== 	
 	
  	3(2K%'66+666666664X>>  	CNNPQQQ+::k::::::::H8BBM  	NN[\\\{{9--H8}}..#$4_$458QQO }//)*:?*:;>vvM7||&&	2>2B2Bi,..A<QZ\gjm\mnnn   -s1vvdsd|<<<;;y))x==?** 0 014ffHs.   3A'N CN #A,N 
PA&P PPtextc                 f    d}d}t          j        |d|           }t          j        |d|          }|S )a  
    Remove base64 encoded images from text to reduce token count and clutter.
    
    This function finds and removes base64 encoded images in various formats:
    - (data:image/png;base64,...)
    - (data:image/jpeg;base64,...)
    - (data:image/svg+xml;base64,...)
    - data:image/[type];base64,... (without parentheses)
    
    Args:
        text: The text content to clean
        
    Returns:
        Cleaned text with base64 images replaced with placeholders
    z+\(data:image/[^;]+;base64,[A-Za-z0-9+/=]+\)z'data:image/[^;]+;base64,[A-Za-z0-9+/=]+z[BASE64_IMAGE_REMOVED])resub)rV  base64_with_parens_patternbase64_patterncleaned_texts       r   clean_base64_imagesr]  p  sG    $ "P @N 646NPTUUL 6.*BLQQLr   c                      ddl m}  t          @t          j        d          }|st          d           | |          adt          j        d<   t          S )	zhGet or create the Exa client (lazy initialization).

    Requires EXA_API_KEY environment variable.
    r   )ExaNrL   zLEXA_API_KEY environment variable not set. Get your API key at https://exa.air   zhermes-agentzx-exa-integration)exa_pyr_  _exa_clientr3   r4   rf   headers)r_  rX   s     r   _get_exa_clientrc    sr    
 )M** 	5   c'***3A/0r   
   querylimitc                    ddl m}  |            rdddS t                              d| |           t	                                          | |ddi	          }g }t          |j        pg           D ]P\  }}|j        pg }|	                    |j
        pd
|j        pd
|rd                    |          nd
|dz   d           Qdd|idS )z6Search using the Exa SDK and return results as a dict.r   is_interruptedInterruptedFru   r   zExa search: '%s' (limit=%d)
highlightsT)num_resultscontentsrD    r   r   r   r   r   r<   r   )tools.interruptri  rt   r   rc  searchr   r   rl  r   r   r   r
  )re  rf  ri  r   r   r   r   rl  s           r   _exa_searchrs    s    ......~ :&5999
KK-ue<<<  ''$
 (  H Kx/5266  	6&,"
:#\'R3=E388J///2A	
 
 	 	 	 	 e[%9:::r   urlsc           
      b   ddl m}  |            rd | D             S t                              dt	          |                      t                                          | d          }g }|j        pg D ];}|j        pd}|j	        pd}|j
        pd}|                    ||||||dd	           <|S )
zExtract content from URLs using the Exa SDK.

    Returns a list of result dicts matching the structure expected by the
    LLM post-processing pipeline (url, title, content, metadata).
    r   rh  c                     g | ]}|d dd	S rj  rD   r   ru   r   r   r   us     r   rE  z _exa_extract.<locals>.<listcomp>  "    NNNAMB??NNNr   zExa extract: %d URL(s)T)rV  rD   r   r   )rq  ri  rt   r   r  rc  get_contentsr   rV  r   r   r   )rt  ri  r   r   r   r   r   r   s           r   _exa_extractr}    s    /.....~ ONNNNNN
KK(#d))444  -- .  H
 G"(b 
 
+#jB""&)E::
 
 	 	 	 	 Nr      c           	      6   ddl m}  |            rdddS t          j        dd                                                                          }|dvrd}t                              d	| ||           t                      j	        
                    | g| |t          |d
                    }g }t          |j        pg           D ]P\  }}|j        pg }|                    |j        pd|j        pd|rd                    |          nd|dz   d           Qdd|idS )z;Search using the Parallel SDK and return results as a dict.r   rh  rj  Frk  PARALLEL_SEARCH_MODEagentic)fastzone-shotr  z)Parallel search: '%s' (mode=%s, limit=%d)   )search_queries	objectivemodemax_resultsrD   ro  r   rp  Tr<   r   )rq  ri  r3   r4   rM   r6   rt   r   r   betarr  r-  r   r   excerptsr   r   r   r
  )	re  rf  ri  r  r   r   r   r   r  s	            r   _parallel_searchr    se   ......~ :&59999+Y77==??EEGGD222
KK;UD%PPP#%%*11wrNN	 2  H Kx/5266  	6?(b:#\'R19A388H---rA	
 
 	 	 	 	 e[%9:::r   c           
      B  K   ddl m}  |            rd | D             S t                              dt	          |                      t                      j                            | d           d{V }g }|j        pg D ]Y}|j	        pd}|sd	
                    |j        pg           }|j        pd}|j        pd}|                    ||||||d
d           Z|j        pg D ];}|                    |j        pddd|j        p|j        pdd|j        pdid           <|S )zExtract content from URLs using the Parallel async SDK.

    Returns a list of result dicts matching the structure expected by the
    LLM post-processing pipeline (url, title, content, metadata).
    r   rh  c                     g | ]}|d dd	S rw  r   ry  s     r   rE  z%_parallel_extract.<locals>.<listcomp>  r{  r   zParallel extract: %d URL(s)T)rt  full_contentNrD   r   r   r   r   r   )r   r   r   ru   r   )rq  ri  rt   r   r  r   r  extractr   r  r
  r  r   r   r   errorsr   
error_type)	rt  ri  r   r   r   r   r   r   ru   s	            r   _parallel_extractr    s      /.....~ ONNNNNN
KK-s4yy999/116>> ?        H
 G"(b  %+ 	9kk&/"7R88GjB""&)E::
 
 	 	 	 	 &B  9?]Me&6M:M$eio26
 
 	 	 	 	 Nr   c                 |   	 t          |          }n# t          t          f$ r d}Y nw xY wt          t	          |d          d          }| |dddddd}	 ddlm}  |            rt          d	d
          S t                      }|dk    rt          | |          }t          |                    di                               dg                     |d<   t          j        |dd
          }t          |          |d<   t                              d|           t                                           |S |dk    rt#          | |          }t          |                    di                               dg                     |d<   t          j        |dd
          }t          |          |d<   t                              d|           t                                           |S |dk    rt$                              d| |           t)          d| t          |d          d
d
d          }t+          |          }t          |                    di                               dg                     |d<   t          j        |dd
          }t          |          |d<   t                              d|           t                                           |S t$                              d| |           t-                                          | |          }t1          |          }	t          |	          }
t$                              d|
           dd|	id}|
|d<   t          j        |dd
          }t          |          |d<   t                              d|           t                                           |S # t2          $ r}dt5          |           }t$                              d|           ||d <   t                              d|           t                                           t          |          cY d}~S d}~ww xY w)!aE  
    Search the web for information using available search API backend.

    This function provides a generic interface for web search that can work
    with multiple backends (Parallel or Firecrawl).

    Note: This function returns search result metadata only (URLs, titles, descriptions).
    Use web_extract_tool to get full content from specific URLs.
    
    Args:
        query (str): The search query to look up
        limit (int): Maximum number of results to return (default: 5)
    
    Returns:
        str: JSON string containing search results with the following structure:
             {
                 "success": bool,
                 "data": {
                     "web": [
                         {
                             "title": str,
                             "url": str,
                             "description": str,
                             "position": int
                         },
                         ...
                     ]
                 }
             }
    
    Raises:
        Exception: If search fails or API key is not set
    r~  r   r  re  rf  Nr   )
parametersru   results_countoriginal_response_sizefinal_response_sizerh  rj  Fr   rE   r   r<   r  r   indentensure_asciir  web_search_toolrG   rF   zTavily search: '%s' (limit: %d)rr  r  )re  r  include_raw_contentinclude_imagesz'Searching the web for: '%s' (limit: %d)zFound %d search resultsTr   zError searching web: %sru   )r   	TypeErrorrf   r-  maxrq  ri  
tool_errorrR   r  r  r>   r   dumps_debuglog_callsavers  rt   r   r   r   r}   rr  r   r@   r   r	  )re  rf  debug_call_datari  rC   response_dataresult_jsonr   r   r   r  r  	error_msgs                r   r  r  2  sc   DE

z"   E1s##E 
 
 "# 	 	OR%222222> 	<mU;;;; ..j  ,UE::M/2=3D3DVR3P3P3T3TUZ\^3_3_/`/`OO,*]15QQQK585E5EO12OO-???KKMMMe'u55M/2=3D3DVR3P3P3T3TUZ\^3_3_/`/`OO,*]15QQQK585E5EO12OO-???KKMMMhKK95%HHH!("5"~~',"'	- -  C =SAAM/2=3D3DVR3P3P3T3TUZ\^3_3_/`/`OO,*]15QQQK585E5EO12OO-???KKMMM=ueLLL(**11 2 
 

 2(;;K((-}=== {
 
 ,9( jquMMM14[1A1A-. 	)?;;; % % %4CFF44	T9%%%#, )?;;;)$$$$$$$$%sG    (( N2 6B<N2 3B.N2 "C,N2 C"N2 2
P;<A4P60P;6P;Tformatuse_llm_processingc                 N  ,-K   ddl m} ddlm} | D ]N}|                    |          s|                     ||                    rt          j        ddd          c S O| |||ddddddg g d	}	 t                              d
t          |                      g }	g }
| D ]A}t          |          s|
                    |dddd           ,|	                    |           B|	sg }nt                      }|dk    rt          |	           d{V }n|dk    rt          |	          }n|dk    rXt                              dt          |	                     t          d|	dd          }t!          ||	r|	d         nd          }n6g }|dk    rdg}n|dk    rdg}nddg}g }ddlm} |	D ]} |            r|                    |ddd           't'          |          }|r`t                              d|d         |d                    |                    |dd|d         |d         |d         |d         dd            	 t                              d!|           	 t)          j        t)          j        t/                      j        ||"          d#$           d{V }nK# t(          j        $ r9 t                              d%|           |                    |ddd&d           Y ?w xY wt7          |          }|                    d'i           }d}|                    d          }|                    d          }t;          |t<                    s?t?          |d(          r|                                 }nt?          |d)          r|j!        }ni }|                    d*d          }|                    d+|          }t'          |          }|rbt                              d,|d         |d                    |                    ||dd|d         |d         |d         |d         dd-           |dk    s||r|n|p|pd}|                    |||||d.           # tD          $ rO}t          #                    d/||           |                    |dddtI          |          d0           Y d}~
d}~ww xY w|
r|
|z   }d1|i}t          |                    d1g                     }t                              d2|           ||d3<   t          t          j        |                    |d4<   |ptK                      ,tM                      }|r;|r8t                              d5           |d6                             d7           ,fd8-|                    d1g           }-fd9|D             } t)          j'        |   d{V }!|!D ]\  }"}#}$|"                    d:d;          }|$d<k    rG|d=                             |#           |d>xx         d?z  cc<   t                              d@|           i|$dAk    r7|d=                             |#           t                              dB|           t                              dC|           n|r7|s5t                              dD           |d6                             dE           |                    d1g           D ]W}"|"                    d:d;          }t          |"                    dFd                    }%t                              dG||%           XdH |                    d1g           D             }&d1|&i}'|'                    d1          g k    rtQ          dI          }(tS          |(          })n&t          j        |'dJdK          }(tS          |(          })t          |)          |dL<   |d6                             dM           tT          +                    dN|           tT          ,                                 |)S # tD          $ r}*dOtI          |*           }+t          #                    dP|+           |+|dQ<   tT          +                    dN|           tT          ,                                 tQ          |+          cY d}*~*S d}*~*ww xY w)Ra#  
    Extract content from specific web pages using available extraction API backend.

    This function provides a generic interface for web content extraction that
    can work with multiple backends. Currently uses Firecrawl.

    Args:
        urls (List[str]): List of URLs to extract content from
        format (str): Desired output format ("markdown" or "html", optional)
        use_llm_processing (bool): Whether to process content with LLM for summarization (default: True)
        model (Optional[str]): The model to use for LLM processing (defaults to current auxiliary backend model)
        min_length (int): Minimum content length to trigger LLM processing (default: 5000)

    Security: URLs are checked for embedded secrets before fetching.
    
    Returns:
        str: JSON string containing extracted content. If LLM processing is enabled and successful,
             the 'content' field will contain the processed markdown summary instead of raw content.
    
    Raises:
        Exception: If extraction fails or API key is not set
    r   )
_PREFIX_RE)unquoteFz_Blocked: URL contains what appears to be an API key or token. Secrets must not be sent in URLs.)r   ru   )rt  r  r  r   r   N)r  ru   pages_extractedpages_processed_with_llmr  r  compression_metricsprocessing_appliedz!Extracting content from %d URL(s)rD   :Blocked: URL targets a private or internal network addressr   r   r   ru   rE   rG   rF   zTavily extract: %d URL(s)r  )rt  r  r   markdownhtmlrh  rj  rx  z%Blocked web_extract for %s by rule %sr   rulerg   sourcer   r  r  r   r   r   ru   blocked_by_policyzScraping: %s)r   formatsr   )r   z!Firecrawl scrape timed out for %suc   Scrape timed out after 60s — page may be too large or unresponsive. Try browser_navigate instead.r   r   r   r   r   z0Blocked redirected web_extract for %s by rule %sr   r   r   r   ru   r  r   zScrape failed for %s: %s)r   r   r   r   ru   r   zExtracted content from %d pagesr  r  z3Processing extracted content with LLM (parallel)...r  llm_processingc                   K   |                      dd          }|                      dd          }|                      dd          p|                      dd          }|s| ddfS t          |          }t          |||	
           d{V }|r3t          |          }|d	k    r||z  nd
}|| d<   || d<   ||||	d}| |dfS |||d
ddd}| |dfS )zHProcess a single result with LLM and return updated result with metrics.r   Unknown URLr   rD   r   r   N
no_contentr   r  r   original_sizeprocessed_sizer  
model_used	processedcontent_too_shortr   r  r  r  r  reason	too_shortr>   r  r  )r   r   r   r   r  r  r  r  metricsr   r   s            r   process_single_resultz/web_extract_tool.<locals>.process_single_result  sR     jj66

7B//$jj;;Xvzz)UW?X?X" 6!455 #K 0 0 #;e_j# #      	  8%(^^NJWZ[J[J[(F(Fad% )2F9%,7F=)  #)6*8->&5 G "7K77  #)6*7-0&*"5 G "7K77r   c                 &    g | ]} |          S r   r   )r   r   r  s     r   rE  z$web_extract_tool.<locals>.<listcomp>  s%    NNNv**622NNNr   r   r  r  r  r  r   %s (processed)r  &%s (no processing - content too short)%s (no content to process)PLLM processing requested but no auxiliary model available, returning raw contentllm_processing_unavailabler   %s (%d characters)c                     g | ]h}|                     d d          |                     dd          |                     dd          |                     d          dd|v r
d|d         ini iS r   rD   r   r   ru   r  r  r>   r   rs     r   rE  z$web_extract_tool.<locals>.<listcomp>       	
 	
 	
  uuUB''w++55B//w	 
 GZ]^F^F^)1-@+ABBdf	
 	
 	
r   z%Content was inaccessible or not foundr   r  r  base64_image_removalweb_extract_toolzError extracting content: r  ru   )-agent.redactr  r   r  rr  r   r  rt   r   r  r/   r   rR   r  r}  r   r   rq  ri  r0   r+  wait_for	to_threadr}   scrapeTimeoutErrorr  r   r>   r   r   r   r   r   r@   r	  r   r   check_auxiliary_modelrK  r  r]  r  r  r  ).rt  r  r  r   r   r  r  _urlr  	safe_urlsssrf_blockedr   r   rC   r   r  _is_interruptedblockedr   scrape_payloadr   r   content_markdowncontent_html	final_urlfinal_blockedchosen_content
scrape_errr   r  auxiliary_availableresults_listrM  processed_resultsr   r  statuscontent_lengthtrimmed_resultstrimmed_responser  cleaned_resultr  r  r   r  s.       `                                       @@r   r  r    s4     > ('''''$$$$$$  T"" 	j&7&7&F&F 	: =      	 "4$
 
 $%"# !  O"K%7TCCC 	-/ 	& 	&Cs## &##Y% %    
   %%%%  w	GG"nnG*$$ 1) < <<<<<<<E!!&y11H$$7YHHH%i%&+2 2   6cXaHi	RSgijjj &(Z'')lGGv%%%hGG  *62G 13MMMMMM$ V VC&(( !s]UW'X'XYYY  3377G !$KWU[_^efl^mnnn#&%,Y%7:A&/SZ[aSbnuv~n  2A  2A( (   
 !FNC888%292B ' 1$9$;$;$B(+,3!" !" !"
 )+3 3 3 - - - - - -MM  '3 % % %"NN+NPSTTT#NN'*RB *O, ,    %H% *A)O)O#1#5#5j"#E#E "+9+=+=j+I+I('5'9'9&'A'A  *(D99 .&x>> .+3+>+>+@+@!(:!>!> .+3+<+- !)Wb 9 9 %-LLc$B$B	(<Y(G(G( %"KK(Z\ijp\q  tA  BH  tI  J  J  J#NN'05R`b)6y)A>KF>S]jkq]r  L  MU  V  6W  6W, ,   
 % ?E
>R>RW]WejzWe)9)9  CO  Ci  Sc  Ci  gi#,%*'5+9(0( (     %   %?jQQQ#&%'')+-%(__( (          	-"W,Gw'hll9b99::5GGG-<)*47
88L8L4M4M01B#@#B#B355  I	G"5 I	GKKMNNN01889IJJJ)8 )8 )8 )8 )8 )8X $<<	266LNNNNNNNE&-ne&< < < < < < < ,= 
F 
F'jj66[((#$9:AA'JJJ#$>???1D???KK 0#6666{**#$9:AA'JJJKK H#NNNNNN#?EEEE
F " [*= [qrrr 45<<=YZZZ",,y"55 G Gjj66!$VZZr%B%B!C!C0#~FFFF	
 	
 \\)R00	
 	
 	
 &7	**b00$%LMMK0==NN *%5aeTTTK0==N14^1D1D-.,-445KLLL 	*O<<< % % %9Q99	T9%%%#, *O<<<)$$$$$$$$%s   5G` Q%$AJ'&Q%'AK/+Q%,` .K//EQ%4` 6-Q%#` %
R>/AR93` 9R>>M` 
b$%A4bb$b$basicinstructionsdepthc                 4  ,-.K   | ||||ddddddg g d}	 |pt                      -t                      }t                      }|dk    r|                     d          sd|  } t	          |           st          j        d| d	d	d
dgid          S t          |           }	|	rct          	                    d|	d         |	d                    t          j        d| d	d	|	d         |	d         |	d         |	d         ddgid          S ddl
m}
  |
            rt          dd          S t          	                    d|            | d|d}|r||d<   t          d|          }t          ||           }d|i}t          |                    dg                     }t          	                    d|           ||d<   t          t          j        |                    |d <   |r|rt          	                    d!           |d"                             d#           -fd$,,fd%|                    dg           D             }t%          j        |  d{V }|D ]7\  }}}|d&k    r+|d'                             |           |d(xx         d)z  cc<   8|r7|s5t                              d*           |d"                             d+           d, |                    dg           D             }t          j        d|id-d.          }t+          |          }t          |          |d/<   t,                              d0|           t,                                           |S t3                      s)t          j        d1t5                       d2dd3d          S |                     d          s d|  } t          	                    d4|            |rd5| d6nd	}t          	                    d7| |           t	          |           st          j        d| d	d	d
dgid          S t          |           }	|	rct          	                    d|	d         |	d                    t          j        d| d	d	|	d         |	d         |	d         |	d         ddgid          S dd8d9gid:}|rt          	                    d;           ddl
m}
  |
            rt          dd          S 	  t7                      j        ddd<| i|}n.# t:          $ r!}t                              d=|            d}~ww xY wg }g }t?          |d>          r|j         r|j         ng }t          	                    d?tC          |d@dA                     t          	                    dBt          |                     |st                              dCdD tE          |          D                        t                              d?tC          |d@dE                     t                              dFtC          |dGdE                     t                              dHtC          |dIdE                     ntG          |tH                    rd>|v r|                    d>g           }nt                              dJ           t                              dKtK          |                     t?          |dL          r?t                              dMtM          |j'        (                                                     |D ]}dN}d	} d}!d}"i }#t?          |dO          rU|)                                }$|$                    d9          }!|$                    dP          }"|$                    dQi           }#nt?          |dL          rtC          |d9d          }!tC          |dPd          }"tC          |dQi           }%t?          |%dO          r|%)                                }#nt?          |%dL          r|%j'        }#nptG          |%tH                    r|%}#nXi }#nUtG          |tH                    r@|                    d9          }!|                    dP          }"|                    dQi           }#tG          |#tH                    s?t?          |#dO          r|#)                                }#nt?          |#dL          r|#j'        }#ni }#|#                    dR|#                    d<dN                    }|#                    dSd	          } t          |          }&|&rbt          	                    dT|&d         |&d                    |                    || d	d	|&d         |&d         |&d         |&d         ddU           i|!p|"pd	}'|                    || |'|'|#dV           d|i}t          |                    dg                     }t          	                    d|           ||d<   t          t          j        |                    |d <   |r;|r8t          	                    d!           |d"                             d#           -fdW.|                    dg           }(.fdX|(D             }t%          j        |  d{V }|D ]\  }}}|                    d<dN          }|d&k    rG|d'                             |           |d(xx         d)z  cc<   t          	                    dY|           i|dZk    r7|d'                             |           t          	                    d[|           t                              d\|           n|r7|s5t                              d*           |d"                             d+           |                    dg           D ]W}|                    d<dN          }t          |                    d]d	                    })t          	                    d^||)           Xd_ |                    dg           D             }d|i}*t          j        |*d-d.          }t+          |          }t          |          |d/<   |d"                             d`           t,                              d0|           t,                                           |S # t:          $ r}datU          |           }+t                              db|+           |+|dc<   t,                              d0|           t,                                           t          |+          cY d}~S d}~ww xY w)ea  
    Crawl a website with specific instructions using available crawling API backend.
    
    This function provides a generic interface for web crawling that can work
    with multiple backends. Currently uses Firecrawl.
    
    Args:
        url (str): The base URL to crawl (can include or exclude https://)
        instructions (str): Instructions for what to crawl/extract using LLM intelligence (optional)
        depth (str): Depth of extraction ("basic" or "advanced", default: "basic")
        use_llm_processing (bool): Whether to process content with LLM for summarization (default: True)
        model (Optional[str]): The model to use for LLM processing (defaults to current auxiliary backend model)
        min_length (int): Minimum content length to trigger LLM processing (default: 5000)
    
    Returns:
        str: JSON string containing crawled content. If LLM processing is enabled and successful,
             the 'content' field will contain the processed markdown summary instead of raw content.
             Each page is processed individually.
    
    Raises:
        Exception: If crawling fails or API key is not set
    )r   r  r  r  r   r   Nr   )r  ru   pages_crawledr  r  r  r  r  rF   )zhttp://https://r  r   rD   r  r  F)r  z#Blocked web_crawl for %s by rule %sr   r  rg   r  r  r  rh  rj  r  zTavily crawl: %sr  )r   rf  extract_depthr  crawlr  zCrawled %d pagesr  r  z1Processing crawled content with LLM (parallel)...r  r  c                 z  K   |                      dd          }|                      dd          }|                      dd          }|s| d dfS t          |          }t          |||           d {V }|r8|| d<   || d<   ||t          |          |rt          |          |z  ndd	}| |d
fS |||dd dd}| |dfS )Nr   r  r   rD   r   r  r   r  r  r  r  r  r  r  )	r   page_urlr   r   r  r  r  r   r   s	          r   _process_tavily_crawlz-web_crawl_tool.<locals>._process_tavily_crawle  s     %zz%??H"JJw33E$jjB77G" :%t\99$'LLM&>wRWYhjt&u&u u u u u u uI  <07}-,5y)*2]fijsftftZg8pI8V8Vmp  AP#Q #Q%w;;&.bo47tWjl lG!7K77r   c                 &    g | ]} |          S r   r   )r   r  r  s     r   rE  z"web_crawl_tool.<locals>.<listcomp>w  s%    WWWa..q11WWWr   r  r  r  r   r  r  c                     g | ]h}|                     d d          |                     dd          |                     dd          |                     d          dd|v r
d|d         ini iS r  r  r  s     r   rE  z"web_crawl_tool.<locals>.<listcomp>  s     O O Omn ()uuUB'7'7!%%QSBTBTabafafgprtauau  AB  AF  AF  GN  AO  AO  i  iFY]^F^F^)1-@+ABBdf i O O Or   r   r  r  web_crawl_toolzFweb_crawl requires Firecrawl. Set FIRECRAWL_API_KEY, FIRECRAWL_API_URLz*, or use web_search + web_extract instead.rk  z Added https:// prefix to URL: %sz with instructions: ''zCrawling %s%sr  r  )rf  scrape_optionsz;Instructions parameter ignored (not supported in crawl API)r   zCrawl API call failed: %sr   z
Status: %sr  unknownzRetrieved %d pageszCrawlJob attributes: %sc                 <    g | ]}|                     d           |S r   r   )r   attrs     r   rE  z"web_crawl_tool.<locals>.<listcomp>  s-    8x8x8x$cgcrcrsvcwcw8x8x8x8xr   zN/Az	Total: %stotalzCompleted: %s	completedzUnexpected crawl result typezResult type: %sr   zResult attributes: %sr  r   r  r   r   r   z"Blocked crawled page %s by rule %sr  r   c                 p  K   |                      dd          }|                      dd          }|                      dd          }|s| ddfS t          |          }t          |||	
           d{V }|r3t          |          }|dk    r||z  nd	}|| d
<   || d<   ||||	d}| |dfS |||d	ddd}| |dfS )zNProcess a single crawl result with LLM and return updated result with metrics.r   r  r   rD   r   Nr  r   r  r   r  r  r  r  r  r  )r   r  r   r   r  r  r  r  r  r   r   s            r   process_single_crawl_resultz3web_crawl_tool.<locals>.process_single_crawl_result4  s<     !::e];;

7B// **Y33 6!455 #G #;Xuoz# #      	  8%(^^NJWZ[J[J[(F(Fad% -4F=)(1F9%  ()6*8->&5 G "7K77  ()6*7-0&*"5 G "7K77r   c                 &    g | ]} |          S r   r   )r   r   r  s     r   rE  z"web_crawl_tool.<locals>.<listcomp>a  s%    TTTV0088TTTr   r  r  r  r  r   r  c                     g | ]h}|                     d d          |                     dd          |                     dd          |                     d          dd|v r
d|d         ini iS r  r  r  s     r   rE  z"web_crawl_tool.<locals>.<listcomp>{  r  r   r  zError crawling website: r  ru   r   )+r   r  rR   r   r/   r   r  r0   rt   r   rq  ri  r  r   r   r  r>   r   r+  rK  r  r]  r  r  r  rT   rj   r}   r   r@   r	  r   r   r   dirr   r   typer   r   keysr   r   )/r   r  r  r  r   r   r  r  rC   r  _is_intr   r   r   r   r  rM  r  r   r  r  r  r  r  instructions_textcrawl_paramscrawl_resultr  pages	data_listr   r  r   r  r  r   	item_dictmetadata_objpage_blockedr   r  r  r  r  r  r   r  s/        `                                      @@@r   r  r    s     @ ("4$
 
 $%"# !  O$j%B#@#B#B355.. h>>"9:: '&&& s## rz9sRTVY0[ 0[ /\ #]kpr r r r +3//G QA76?T[\bTcdddz9sRTVahiras29&/7SY?fmnvfw)x)x0z 0z /{ #| KPQ Q Q Q BAAAAAwyy @!-????KK*C000!&' 'G
  7*6'!'733C1#CHHHG!7+H  Y ; ;<<MKK*M:::/<OO,8;DJx<P<P8Q8QO45 " I&9 IOPPP 45<<=MNNN8 8 8 8 8 8$ XWWW8<<	SU;V;VWWW*1.%*@$@$@$@$@$@$@!/@ I I+FGV,,'(=>EEgNNN'(BCCCqHCCC! [*= [qrrr 45<<=YZZZO Orzr~r~  @I  KM  sN  sNO O OO*i%A!Z_```K0==N585H5HO12OO,o>>>KKMMM!! '(( 	#:i:<<i i i   "	# # # # ~~566 	A"S""CKK:C@@@GS[CLCCCCY[OS*;<<< 3 	n:y3PRU,W ,W +X Ygln n n n 's++ 	MKK=wvPWX^P_```:y3PR]den]o.5fowvbijrbs%t%t,v ,v +w x GLM M M M J<
 
  	WKKUVVV======799 	<mU;;;;	80228   LL  	 	 	LL4a888	 ') 	 <(( 	Z-9->F))BIKKglHi&P&PQQQKK,c)nn===  Y68x8x#lJ[J[8x8x8xyyy\7<5+Q+QRRR[',*O*OPPP_glKQV.W.WXXXd++ 	Z,0F0F$((44IINN9:::LL*D,>,>???|Z00 Z4d<;P;U;U;W;W6X6XYYY E	 E	D$HE#LH t\** 4 OO--	#,==#<#< (}}V44$==R88z** 4#*4T#B#B &tVT::  'tZ<<<66 "+6688HH\:66 "+4HHd33 "+HH!HHD$'' 4#'88J#7#7 #xx//88J33 h-- "8\22 "'2244HHXz22 "'0HH!H  ||Ke]1S1STTHLL"--E 099L @,vBVXdekXlmmm#eSU))42>v2FP\]cPdp|  ~F  qG  *H  *H    
  '<,<"GLL"&$      u%HLLB7788&666+8(47
88L8L4M4M01  I	L"5 I	LKKKLLL01889IJJJ)8 )8 )8 )8 )8 )8X $<<	266LTTTT|TTTE&-ne&< < < < < < < ,= 
K 
K'!::e];;[((#$9:AA'JJJ#$>???1D???KK 0(;;;;{**#$9:AA'JJJKK H(SSSSNN#?JJJJ
K " [*= [qrrr 45<<=YZZZ",,y"55 L L!::e];;!$VZZ	2%>%>!?!?0(NKKKK	
 	
 \\)R00	
 	
 	
 &7j!1!%PPP,[9914^1D1D-.,-445KLLL 	(/::: % % %7s1vv77	T9%%%#, (/:::)$$$$$$$$%ss   A9r A3r  r )H+r 6r Br A3r Ar S* )r *
T4TT]8r 
tA4tttc                  :    t                      pt                      S )a  
    Check whether the Firecrawl backend is available.

    Availability is true when either:
    1) direct Firecrawl config (`FIRECRAWL_API_KEY` or `FIRECRAWL_API_URL`), or
    2) Firecrawl gateway origin + Nous Subscriber access token
       (fallback when direct Firecrawl is not configured).

    Returns:
        bool: True if direct Firecrawl or the tool-gateway can be used.
    )rd   rN   r   r   r   rT   rT     s     ())E-C-E-EEr   c                      t                                          dd                                                                          } | dv rt	          |           S t          d dD                       S )z6Check whether the configured web backend is available.rC   rD   )rG   rE   r   rF   c              3   4   K   | ]}t          |          V  d S r   )rU   )r   rC   s     r   	<genexpr>z$check_web_api_key.<locals>.<genexpr>  s+      hh'$W--hhhhhhr   )rA   r>   rM   r6   rU   any)rO   s    r   check_web_api_keyr#    sm    !##''	266<<>>DDFFJ???$Z000hh=ghhhhhhr   c                  .    t                      \  } }}| duS )zICheck if an auxiliary text model is available for LLM content processing.Nr   )r   r   s     r   r  r    s    133LFAqr   __main__u    🌐 Standalone Web Tools Modulez(========================================rH   rI   u   ✅ Web backend: rG   z!   Using Exa API (https://exa.ai)rE   z+   Using Parallel API (https://parallel.ai)rF   z(   Using Tavily API (https://tavily.com)z    Using self-hosted Firecrawl: rW   z#   Using direct Firecrawl cloud APIz!   Using Firecrawl tool-gateway: z0   Firecrawl backend selected but not configuredu$   ❌ No web search backend configuredzWSet EXA_API_KEY, PARALLEL_API_KEY, TAVILY_API_KEY, FIRECRAWL_API_KEY, FIRECRAWL_API_URLu;   ❌ No auxiliary model available for LLM content processingzVSet OPENROUTER_API_KEY, configure Nous Portal, or set OPENAI_BASE_URL + OPENAI_API_KEYuK   ⚠️  Without an auxiliary model, LLM content processing will be disabledu   ✅ Auxiliary model available: r   u!   🛠️  Web tools ready for use!u+   🧠 LLM content processing available with z&   Default min length for processing: z charsu&   🐛 Debug mode ENABLED - Session ID: z    Debug logs will be saved to: z/web_tools_debug_z.jsonu=   🐛 Debug mode disabled (set WEB_TOOLS_DEBUG=true to enable)z
Basic usage:zI  from web_tools import web_search_tool, web_extract_tool, web_crawl_toolz  import asyncioz  # Search (synchronous)z/  results = web_search_tool('Python tutorials')z$  # Extract and crawl (asynchronous)z  async def main():z?      content = await web_extract_tool(['https://example.com'])zC      crawl_data = await web_crawl_tool('example.com', 'Find docs')z  asyncio.run(main())z
LLM-enhanced usage:zC  # Content automatically processed for pages >5000 chars (default)zA  content = await web_extract_tool(['https://python.org/about/'])z#  # Customize processing parametersz$  crawl_data = await web_crawl_tool(z      'docs.python.org',z      'Find key concepts',z,      model='google/gemini-3-flash-preview',z      min_length=3000z  )z  # Disable LLM processingzY  raw_content = await web_extract_tool(['https://example.com'], use_llm_processing=False)z
Debug mode:z  # Enable debug loggingz  export WEB_TOOLS_DEBUG=truez  # Debug logs capture:z$  # - All tool calls with parametersz  # - Original API responsesz  # - LLM compression metricsz  # - Final processed resultsz3  # Logs saved to: ./logs/web_tools_debug_UUID.jsonuL   
📝 Run 'python test_web_tools_llm.py' to test LLM processing capabilities)registryr  
web_searcha  Search the web for information. Returns up to 5 results by default with titles, URLs, and descriptions. The query is passed through to the configured backend, so operators such as site:domain, filetype:pdf, intitle:word, -term, and "exact phrase" may work when the backend supports them.objectstringzThe search query to look up on the web. You may include backend-supported operators such as site:example.com, filetype:pdf, intitle:word, -term, or "exact phrase".)r  r   integerz3Maximum number of results to return. Defaults to 5.r  )r  r   minimummaximumdefaultr  )r  
propertiesrequired)r1   r   r  r   u  Extract content from web page URLs. Returns page content in markdown format. Also works with PDF URLs (arxiv papers, documents, etc.) — pass the PDF link directly and it converts to markdown text. Pages under 5000 chars return full markdown; larger pages are LLM-summarized and capped at ~5000 chars per page. Pages over 2M chars are refused. If a URL fails or times out, use the browser tool to access it instead.arrayr  z:List of URLs to extract content from (max 5 URLs per call))r  r   r   maxItemsr<   c                 t    t          |                     dd          |                     dd                    S )Nre  rD   rf  r~  )rf  )r  r>   r   kws     r   rH  rH  X  s1    txx/D/DDHHU\^_L`L`aaa r   u   🔍r   )r1   toolsetschemahandlercheck_fnrequires_envemojimax_result_size_charsc                     t          t          |                     d          t                    r|                     dg           d d         ng d          S )Nrt  r~  r  )r  r   r>   r   r3  s     r   rH  rH  b  sP    /$.txx/?/?$F$FNRaR  BPZ \  \ r   u   📄)	r1   r5  r6  r7  r8  r9  is_asyncr:  r;  )r   N)rD   r   )r  FrD   )rd  )r~  )sr$   r   loggingr3   rX  r+  typingr   r   r   r   r   r   r   r	   r
   r  __annotations__r   r   r   r&   r'   r(   tools.debug_helpersr)   tools.managed_tool_gatewayr*   r+   rb   r,   tools.tool_backend_helpersr-   r.   tools.url_safetyr/   tools.website_policyr0   	getLoggerr!   rt   r   r5   r8   r   rA   rR   rU   rx   ry   rJ  r\   r^   rN   rd   rh   rj   r   rr   r}   r   r   r   r   r4   r   r   r   r   r   r   r   r   $DEFAULT_MIN_LENGTH_FOR_SUMMARIZATIONr   r   r   r  r   r  r  r  r]  ra  rc  rs  r}  r  r  r  r  r  rT   r#  r  printweb_availabletool_gateway_availabler6   firecrawl_key_availablefirecrawl_url_availablenous_availabledefault_summarizer_modelrC   r[   exitactive
session_idlog_dirtools.registryr&  r  WEB_SEARCH_SCHEMAWEB_EXTRACT_SCHEMAregisterr   r   r   <module>rW     sl  ' ' 'R   				 				  ; ; ; ; ; ; ; ; ; ; ; ; ; ;   $######'+ htn + + + T        2 2 2 2 2 2 2 2 O	         
 - , , , , ,         
 S R R R R R R R ( ( ( ( ( ( 5 5 5 5 5 5		8	$	$
%3 %4 % % % %$    c    6
3 
4 
 
 
 
   @huT#s(^U3PXY\P]_ghk_lKlEm5m/n&o @ @ @ @"1C 1 1 1 1
g g g g g
6d 6 6 6 6
       49    *% % %R     $" " "& 29.0HII c D T    (;t ; ; ; ; ;"% %$ %c %4PTUXZ]U]P^K_ % % % %PC C    .
3 
4S#X+? 
 
 
 
# $tCH~2F    >
3 
4S> 
 
 
 
 (, $Lc Ld L L L L/ /(3- /5RUX`adXegkloqtltguIuCv / / / /x}    
 
k+<	=	=	=
 :a aa	a a C=	a
 a c]a a a aP q qqq C=q 	q
 q q c]q q q qhPPP C=P 	P
 P c]P P P Pfc c    F   *; ;s ;3 ; ; ; ; ;8tCy T$sCx.%9    D; ;C ; ;D ; ; ; ;>($s) (T#s(^0D ( ( ( (VE% E%3 E%s E%3 E% E% E% E%T #:E% E%
s)E%E% E% C=	E%
 E% 	E% E% E% E%T
 #:Z% Z%	Z%Z% Z% 	Z%
 C=Z% Z% 	Z% Z% Z% Z%|F F F F Fi4 i i i it     z 
E
,---	E(OOO &%''M3355"d929-@"#E#E#K#K#M#MNN"d929-@"#E#E#K#K#M#MNN**,,N<<>> 
,..+'++,,,eE56666
""E?@@@@  E<====& JmCV9W9W9]9]9_9_9f9fgj9k9kmmnnnn( J;<<<<' JX:T:T:V:VXXYYYYHIIII45552--//2 2	
 	
 	

  LKLLLfggg[\\\\J0HJJKKK Q	E
-... eV<TVVWWWc7[cccddd } OJv7HJJKKKjjjRXRcjjjkkkkMNNN	E
	E
UVVV	E
	E"III	E
$%%%	E
;<<<	E"III	E
0111	E
   	E
KLLL	E
OPPP	E
!""" k%&&&STTTQRRRb			34444555()))*+++<===%&&&eb			*+++ijjj	E/	E
$%%%	E
)***	E
#$$$	E
0111	E
()))	E
)***	E
)***	E
?@@@	E
YZZZ 0 / / / / / / /  w !  G 
 "T 
 
 I   .  v (+[	 
 H   "  	aa""$$
!	 	 	 	  	\ \""$$
!     r   