
    +hu                       U d dl mZ d dlZd dlZd dlmZmZmZmZm	Z	 d dl
mZ ddlmZ  G d de	      Z ed	d
      Zd*dZd+dZddgfd,dZd-d.dZddgfd,dZd*dZd*dZd*dZd*dZd*dZd*dZd/dZddgf	 	 	 	 	 d0dZd*dZd1dZd2dZ 	 	 	 	 	 	 	 	 d3dZ!d4dZ"d5d Z#d6d!Z$d7d"Z%d8d#Z&d9d:d$Z'd%Z(d&e)d'<   d;d(Z*	 	 	 	 	 	 	 	 	 	 	 	 	 	 d<d)Z+y)=    )annotationsN)AnyTypeVarCallableOptional
NamedTuple)	TypeAlias   )pandasc                  l    e Zd ZU ded<   dZded<   dZded<   dZded<   dZded	<   dZded
<   dZ	ded<   y)RemediationstrnameNzOptional[str]immediate_msgnecessary_msgzOptional[Callable[[Any], Any]]necessary_fnoptional_msgoptional_fn	error_msg)
__name__
__module____qualname____annotations__r   r   r   r   r   r        I/var/www/html/venv/lib/python3.12/site-packages/openai/lib/_validators.pyr   r      sE    
I#'M='#'M='37L07"&L-&26K/6#I}#r   r   OptionalDataFrameTzOptional[pd.DataFrame])boundc                f    d}t        |       |k\  rdnd}dt        |        d| }t        d|      S )z
    This validator will only print out the number of examples and recommend to the user to increase the number of examples if less than 100.
    d    z. In general, we recommend having at least a few hundred examples. We've found that performance tends to linearly increase for every doubling of the number of examplesz
- Your file contains z prompt-completion pairsnum_examplesr   r   )lenr   )dfMIN_EXAMPLESoptional_suggestionr   s       r   num_examples_validatorr(      sO     L r7l" 	 w 
 .c"gY6NObNcdMN-HHr   c                   ddd}d}d}d}| j                   vrO| j                   D cg c]  }t        |      j                          c}v rdfd}|}d d}d d}nd d}t        d	||||
      S c c}w )z[
    This validator will ensure that the necessary column is present in the dataframe.
    c                    | j                   D cg c]!  }t        |      j                         |k(  s |# }}| j                  |d   |j                         id       | S c c}w )Nr   T)columnsinplace)r+   r   lowerrename)r%   columnccolss       r   lower_case_columnz5necessary_column_validator.<locals>.lower_case_column,   sT    ::BaQ6)ABB
		47FLLN3T	B	 Cs
   !AANc                     |       S Nr   )r%   r2   necessary_columns    r   lower_case_column_creatorz=necessary_column_validator.<locals>.lower_case_column_creator9   s    (-=>>r   z
- The `z ` column/key should be lowercasezLower case column name to ``z^` column/key is missing. Please make sure you name your columns/keys appropriately, then retryr5   )r   r   r   r   r   )r%   pd.DataFramer/   r   returnr8   )r%   r8   r9   r8   )r+   r   r-   r   )	r%   r5   r   r   r   r   r0   r6   r2   s	    `      @r   necessary_column_validatorr:   '   s    

 MLMIrzz)

C1ACC? 5L'(8'99YZM9:J9K1MM,-  .L  MI##!   Ds    A<prompt
completionc                F   g }d}d}d}t        | j                        dkD  rh| j                  D cg c]	  }|vs| }}d}|D ]1  }|D cg c]	  }||v s| }	}t        |	      dkD  s&|d| d| dz  }3 d| | }d	| }dfd
}t        d|||      S c c}w c c}w )zK
    This validator will remove additional columns from the dataframe.
    Nr
   r!   r   z9
  WARNING: Some of the additional columns/keys contain `z<` in their name. These will be ignored, and the column/key `z`` will be used instead. This could also result from a duplicate column/key in the provided file.zh
- The input file should contain exactly two columns/keys per row. Additional columns/keys present are: z Remove additional columns/keys: c                    |    S r4   r   xfieldss    r   r   z1additional_column_validator.<locals>.necessary_fn^   s    V9r   additional_columnr   r   r   r   r@   r   r9   r   )r$   r+   r   )
r%   rA   additional_columnsr   r   r   r0   warn_messageacdupss
    `        r   additional_column_validatorrI   K   s    MML
2::)+GAqaGG$ 	BB1=!R1WA=D=4y1}"\]_\`  a]  ^`  ]a  aA  !B  B	B D  EW  DX  Ye  Xf  g:;M:NO	  ##!	  H >s   	BB	BBc                   d}d}d}|    j                  d       j                         s!|    j                         j                         rg|    dk(  |    j                         z  }| j                         j                  |   j                         }d d| }dfd}dt        |       d d	}t        d
 |||      S )zA
    This validator will ensure that no completion is empty.
    Nc                    | dk(  S )Nr!   r   r@   s    r   <lambda>z+non_empty_field_validator.<locals>.<lambda>q   s
    b r   r!   z
- `z?` column/key should not contain empty strings. These are rows: c                <    | |    dk7     j                  g      S )Nr!   subset)dropna)r@   fields    r   r   z/non_empty_field_validator.<locals>.necessary_fnv   s&    QuX^$++E7+;;r   Remove z rows with empty sempty_rC   rD   )applyanyisnullreset_indexindextolistr$   r   )r%   rR   r   r   r   
empty_rowsempty_indexess    `     r   non_empty_field_validatorr^   i   s     MLM	%y()--/2e93C3C3E3I3I3Ki2o"U)*:*:*<=
(..z:AACw&efsetu	< "#m"4!55FugQOeW##!	 r   c                4   | j                        }| j                         j                  |   j                         }d}d}d}t	        |      dkD  r8dt	        |       ddj                         d| }dt	        |       d	}dfd
}t        d|||      S )zY
    This validator will suggest to the user to remove duplicate rows if they exist.
    rO   Nr   
- There are z duplicated -z sets. These are rows: rS   z duplicate rowsc                (    | j                        S )NrO   )drop_duplicatesr?   s    r   r   z.duplicated_rows_validator.<locals>.optional_fn   s    $$F$33r   duplicated_rowsr   r   r   r   rD   )
duplicatedrY   rZ   r[   r$   joinr   )r%   rA   rd   duplicated_indexesr   r   r   s    `     r   duplicated_rows_validatorri      s     mm6m2O)//@GGIMLK
"(-?)@(AchhW]N^M__v  xJ  wK  L %7!8 9I	4 #!	 r   c                    d}d}d}t        |       }|dk7  rBdd |       t              dkD  r(dt               d d}dt               d	}dfd
}t        d|||      S )zW
    This validator will suggest to the user to remove examples that are too long.
    Nopen-ended generationc                    | j                  d d      }| j                         j                  |   j                         S )Nc                ^    t        | j                        t        | j                        z   dkD  S )Ni'  )r$   r;   r<   rL   s    r   rM   zClong_examples_validator.<locals>.get_long_indexes.<locals>.<lambda>   s"    c!((mc!,,>O.ORW.W r      )axis)rV   rY   rZ   r[   )dlong_exampless     r   get_long_indexesz1long_examples_validator.<locals>.get_long_indexes   s6    GG$W^_G`M==?((7>>@@r   r   r`   z. examples that are very long. These are rows: zf
For conditional generation, and for classification the examples shouldn't be longer than 2048 tokens.rS   z long examplesc                     |       }|k7  r/t         j                  j                  dt        |       d| d       | j	                  |      S )NzeThe indices of the long examples has changed as a result of a previously applied recommendation.
The z? long examples to be dropped are now at the following indices: 
)sysstdoutwriter$   drop)r@   long_indexes_to_droprr   long_indexess     r   r   z,long_examples_validator.<locals>.optional_fn   s    '7':$#77JJ$$ A  BE  FZ  B[  A\  \[  \p  [q  qs  t vv233r   rq   re   )rp   r8   r9   r   rD   )infer_task_typer$   r   )r%   r   r   r   ft_typerr   rz   s        @@r   long_examples_validatorr}      s     MLKb!G))	A (+|q ,S->,??mnzm{  |c  dM$S%6$7~FL4 #!	 r   c                b   d}d}d}d}dg d}|D ]t  }|dk(  r4| j                   j                  j                  d      j                         r<| j                   j                  j                  |d      j                         rr| n j	                  dd      }t        |       }|d	k(  rt        d
      S ddt        | j                   d      }	| j                   |	k(  j                         rd|	 d}t        d
|      S |	dk7  r|	j	                  dd      }
d|
 d}t        |	      dkD  r	|d| dz  }| j                   j                  dt        |	        j                  j                  |	d      j                         r|d|	 dz  }nd}|	dk(  rd| d}d fd}t        d||||      S )!z
    This validator will suggest to add a common suffix to the prompt if one doesn't already exist in case of classification or conditional generation.
    Nz


### =>

) ->z

###

z

===

z

---

z

===>

z

--->

r   rt   Fregex\nrk   common_suffixr   suffixc                     | dxx   |z  cc<   | S Nr;   r   r@   r   s     r   
add_suffixz2common_prompt_suffix_validator.<locals>.add_suffix   s    	(vr   xfixzAll prompts are identical: `zt`
Consider leaving the prompts blank if you want to do open-ended generation, otherwise ensure prompts are differentr   r   r!   z 
- All prompts end with suffix `r7   
   R. This suffix seems very long. Consider replacing with a shorter suffix, such as `z5
  WARNING: Some of your prompts contain the suffix `zZ` more than once. We strongly suggest that you review your prompts and add a unique suffixa  
- Your data does not contain a common separator at the end of your prompts. Having a separator string appended to the end of the prompt makes it clearer to the fine-tuned model where the completion should begin. See https://platform.openai.com/docs/guides/fine-tuning/preparing-your-dataset for more detail and examples. If you intend to do open-ended generation, then you should leave the prompts emptyzAdd a suffix separator `z` to all promptsc                     |       S r4   r   r@   r   suggested_suffixs    r   r   z3common_prompt_suffix_validator.<locals>.optional_fn       a!122r   common_completion_suffixr   r   r   r   r   r@   r   r   r   r9   r   rD   )
r;   r   containsrW   replacer{   r   get_common_xfixallr$   )r%   r   r   r   r   suffix_optionssuffix_optiondisplay_suggested_suffixr|   r   common_suffix_new_line_handledr   r   s              @@r   common_prompt_suffix_validatorr      s    IMLK (N ( E!yy}}%%d+//199==!!-u!=AAC(  077eDb!G))00 $BIIH=M
		]"'')2=/  Bw  x	9EE)6)>)>tU)K&;<Z;[[\]}"q  sK  rL  LM  N  NM99==.C../33<<]RW<X\\^UVcUd  e  @  @M p12J1KK[\	3 '#! r   c                4   d}d}d}d}t        | j                  d      dk(  rt        d      S dd| j                  k(  j                         rt        d      S dk7  r&d	 d
}|t	              k  r|dz  }d d}dfd}t        d|||      S )zd
    This validator will suggest to remove a common prefix from the prompt if a long one exist.
       Nprefixr   r!   common_prefixr   c                B    | d   j                   t        |      d  | d<   | S r   r   r$   )r@   r   s     r   remove_common_prefixz<common_prompt_prefix_validator.<locals>.remove_common_prefix  s#    kooc&km4(r   z"
- All prompts start with prefix `r7   z. Fine-tuning doesn't require the instruction specifying the task, or a few-shot example scenario. Most of the time you should only add the input data into the prompt, and the desired output into the completionRemove prefix `z` from all promptsc                     |       S r4   r   )r@   r   r   s    r   r   z3common_prompt_prefix_validator.<locals>.optional_fn!  s    +A}==r   common_prompt_prefixre   )r@   r   r   r   r9   r   rD   )r   r;   r   r   r$   )r%   MAX_PREFIX_LENr   r   r   r   r   s        @@r   common_prompt_prefix_validatorr     s     NMLK#BIIH=M00 			]"'')00=m_ANC..  r  rM,]O;MNL> ##!	 r   c                >   d}t        | j                  d      t              dkD  xr d   dk(  t              |k  rt        d      S dd| j                  k(  j	                         rt        d      S d	 d
}d d}dfd}t        d|||      S )zh
    This validator will suggest to remove a common prefix from the completion if a long one exist.
       r   r   r    r   r   c                \    | d   j                   t        |      d  | d<   |rd| d    | d<   | S )Nr<   r   r   )r@   r   	ws_prefixs      r   r   z@common_completion_prefix_validator.<locals>.remove_common_prefix7  s=    L/--c&km<, !!L/!23AlOr   z&
- All completions start with prefix `z_`. Most of the time you should only add the output data into the completion, without any prefixr   z` from all completionsc                     |       S r4   r   )r@   r   r   r   s    r   r   z7common_completion_prefix_validator.<locals>.optional_fnE  s    #A}i@@r   common_completion_prefixre   )r@   r   r   r   r   r   r9   r   rD   )r   r<   r$   r   r   )r%   r   r   r   r   r   r   r   s        @@@r   "common_completion_prefix_validatorr   ,  s     N#BMMAMM"Q&B=+;s+BI
=N*00 	&++-00=m_  Ml  mM$]O3IJLA '#!	 r   c                    d}d}d}d}t        |       }|dk(  s|dk(  rt        d      S t        | j                  d      }| j                  |k(  j	                         rd| d	| d
}t        d|      S dg d}|D ];  }| j                  j
                  j                  |d      j                         r9| n j                  dd      }	dd|dk7  r|j                  dd      }
d|
 d
}t        |      dkD  r	|d|	 d
z  }| j                  j
                  dt        |        j
                  j                  |d      j                         r|d| dz  }nd}|dk(  rd|	 d}d fd}t        d||||      S )!z
    This validator will suggest to add a common suffix to the completion if one doesn't already exist in case of classification or conditional generation.
    Nrk   classificationr   r   r   r   z All completions are identical: `zJ`
Ensure completions are different, otherwise the model will just repeat `r7   r   z [END])	rt   .z ENDz***z+++z&&&z$$$z@@@z%%%Fr   rt   r   c                     | dxx   |z  cc<   | S Nr<   r   r   s     r   r   z6common_completion_suffix_validator.<locals>.add_suffixv  s    	,6!r   r!   z$
- All completions end with suffix `r   r   z9
  WARNING: Some of your completions contain the suffix `zU` more than once. We suggest that you review your completions and add a unique endingaH  
- Your data does not contain a common ending at the end of your completions. Having a common ending string appended to the end of the completion makes it clearer to the fine-tuned model where the completion should end. See https://platform.openai.com/docs/guides/fine-tuning/preparing-your-dataset for more detail and examples.zAdd a suffix ending `z` to all completionsc                     |       S r4   r   r   s    r   r   z7common_completion_suffix_validator.<locals>.optional_fn  r   r   r   r   r   rD   )
r{   r   r   r<   r   r   r   rW   r   r$   )r%   r   r   r   r   r|   r   r   r   r   r   r   r   s              @@r   "common_completion_suffix_validatorr   P  s    IMLKb!G))W8H-H00#BMMAM
&++-6}o  FQ  R_  Q`  `a  b	9EE  
N ( ==%%m5%AEEG(	
  077eD )6)>)>tU)K&?@^?__`a}"q  sK  rL  LM  N  NM==2M 22377@@V[@\``bYZgYh  i~    M d./G.HH\]	3 '#! r   c                    d
d}d}d}d}| j                   j                  dd j                         dk7  s| j                   j                  d   d   dk7  rd}d}|}t	        d|||	      S )z
    This validator will suggest to add a space at the start of the completion if it doesn't already exist. This helps with tokenization.
    c                6    | d   j                  d       | d<   | S )Nr<   c                8    | j                  d      rd| z   S d| z   S )Nr   r!   )
startswith)rT   s    r   rM   zLcompletions_space_start_validator.<locals>.add_space_start.<locals>.<lambda>  s"    cAR2_`:` X[_`:` r   )rV   rL   s    r   add_space_startz:completions_space_start_validator.<locals>.add_space_start  s     L///0`a,r   Nrn   r   r   z
- The completion should start with a whitespace character (` `). This tends to produce better results due to the tokenization we use. See https://platform.openai.com/docs/guides/fine-tuning/preparing-your-dataset for more detailsz=Add a whitespace character to the beginning of the completioncompletion_space_startre   rD   )r<   r   nuniquevaluesr   )r%   r   r   r   r   s        r   !completions_space_start_validatorr     s    
 LKM	}}!$$&!+r}}/C/CA/Fq/IS/P BV%%#!	 r   c                    dfd}|    j                  d       j                         }|    j                  d       j                         }|dz  |kD  rt        dd d dd	 d
|      S y)zt
    This validator will suggest to lowercase the column values, if more than a third of letters are uppercase.
    c                H    |    j                   j                         | <   | S r4   )r   r-   )r@   r/   s    r   
lower_casez(lower_case_validator.<locals>.lower_case  s"    fIMM'')&	r   c                &    t        d | D              S )Nc              3  d   K   | ](  }|j                         s|j                         s%d  * ywrn   N)isalphaisupper.0r0   s     r   	<genexpr>z9lower_case_validator.<locals>.<lambda>.<locals>.<genexpr>  $     0]qQRQZQZQ\0]   000sumrL   s    r   rM   z&lower_case_validator.<locals>.<lambda>      S0]A0]-] r   c                &    t        d | D              S )Nc              3  d   K   | ](  }|j                         s|j                         s%d  * ywr   )r   islowerr   s     r   r   z9lower_case_validator.<locals>.<lambda>.<locals>.<genexpr>  r   r   r   rL   s    r   rM   z&lower_case_validator.<locals>.<lambda>  r   r   r
   r   z
- More than a third of your `z%` column/key is uppercase. Uppercase zs tends to perform worse than a mixture of case encountered in normal language. We recommend to lower case the data if that makes sense in your domain. See https://platform.openai.com/docs/guides/fine-tuning/preparing-your-dataset for more detailsz'Lowercase all your data in column/key `r7   re   NrD   )rV   r   r   )r%   r/   r   count_uppercount_lowers    `   r   lower_case_validatorr     s    
 V*""#]^bbdKV*""#]^bbdKQ$;F8Chiohp  qh  iB6(!L"	
 	
 r   c                   d}d}d}d}d}t         j                  j                  |       r	 | j                         j	                  d      s| j                         j	                  d      r_| j                         j	                  d      rdnd\  }}d| d}d| d	}t        j                  | |t        
      j                  d      }nV| j                         j	                  d      rdd}d}t        j                  |       }	|	j                  }
t        |
      dkD  r|dz  }t        j                  | t              j                  d      }n| j                         j	                  d      rud}d}t        | d      5 }|j                         }t        j                  |j!                  d      D cg c]  }d|g c}|t              j                  d      }ddd       n?| j                         j	                  d      rit        j"                  | dt              j                  d      }t        |      dk(  r/d}d}t        j"                  | t              j                  d      }nn| j                         j	                  d      rj	 t        j"                  | dt              j                  d      }t        |      dk(  r+t        j"                  | t              j                  d      }nd}d}n.d}d | v r|d!|  d"| j!                  d       d#    d$z  }n	|d!|  d%z  }nd*|  d+}t+        d,|||-      }||fS c c}w # 1 sw Y   (xY w# t$        $ r- t        j"                  | t              j                  d      }Y `w xY w# t$        t&        f$ r1 | j!                  d       d#   j)                         }d&|  d'| d(| d)}Y w xY w).z
    This function will read a file saved in .csv, .json, .txt, .xlsx or .tsv format using pandas.
     - for .xlsx it will read the first sheet
     - for .txt it will assume completions and split on newline
    Nz.csvz.tsv)CSV,)TSV	z=
- Based on your file extension, your file is formatted as a z filezYour format `z` will be converted to `JSONL`)sepdtyper!   z.xlsxzH
- Based on your file extension, your file is formatted as an Excel filez/Your format `XLSX` will be converted to `JSONL`rn   z
- Your Excel file contains more than one sheet. Please either save as csv or ensure all data is present in the first sheet. WARNING: Reading only the first sheet...)r   z.txtz9
- Based on your file extension, you provided a text filez.Your format `TXT` will be converted to `JSONL`rrt   )r+   r   .jsonlT)linesr   z^
- Your JSONL file appears to be in a JSON format. Your file will be converted to JSONL formatz/Your format `JSON` will be converted to `JSONL`z.jsonz^
- Your JSON file appears to be in a JSONL format. Your file will be converted to JSONL formatz]Your file must have one of the following extensions: .CSV, .TSV, .XLSX, .TXT, .JSON or .JSONLr   z Your file `z` ends with the extension `.z` which is not supported.z` is missing a file extension.zYour file `z!` does not appear to be in valid z9 format. Please ensure your file is formatted as a valid z file.zFile z does not exist.read_any_format)r   r   r   r   )ospathisfiler-   endswithpdread_csvr   fillna	ExcelFilesheet_namesr$   
read_excelopenread	DataFramesplit	read_json
ValueError	TypeErrorupperr   )fnamerA   remediationr   r   r   r%   file_extension_str	separatorxlssheetsfcontentlines                 r   r   r     s    KMMI	B	ww~~e<	v{{}%%f-1G1G1O@E@V@VW]@^dq-"ITUgThhmn  #00B/CCa b[[ISAHHL''0 k Qll5)v;?!  &N  NM]]54;;B?''/ \ P%% !ffhG07d0CD"dD &! fRj	 ! ! ''1\\%t3?FFrJr7a< %FM$UMe37>>rBB''0Ce4sCJJ2NB2w!|\\%s;BB2F )J(Y t  %<<w6RSXS^S^_bScdfSgRh  iB  "C  CI<w6T!UUI E7"23	##	K {?c E! !6 " Ce37>>rBBC I& 	v!&S!1"!5!;!;!=%eW,MN`Ma  b[  \n  [o  ou  vI	vsb   EN ;3L>.L9
9L>B1N A(M
 0/N 9L>>MN 
3N =N ?N  N =OOc                L    t        |       }d}|dk(  rd| d}t        d|      S )z
    This validator will infer the likely fine-tuning format of the data, and display it to the user if it is classification.
    It will also suggest to use ada and explain train/validation split benefits.
    Nr   zK
- Based on your data it seems like you're trying to fine-tune a model for z
- For classification, we recommend you try one of the faster and cheaper models, such as `ada`
- For classification, you can estimate the expected model performance by keeping a held out dataset, which is not used for trainingr"   r#   )r{   r   )r%   r|   r   s      r   format_inferrer_validatorr    sA    
 b!GM""fgnfo  pU  VN-HHr   c                `   |j                   Ot        j                  j                  d|j                   d|j                    d       t        j
                  d       |j                  )t        j                  j                  |j                         |j                  |j                  |       } | S )zs
    This function will apply a necessary remediation to a dataframe, or print an error message if one exists.
    z

ERROR in z validator: z

Aborting...rn   )	r   ru   stderrrw   r   exitr   rv   r   )r%   r   s     r   apply_necessary_remediationr  (  s     (

=)9)9(:,{G\G\F]]lmn  ,

223+%%b)Ir   c                    t         j                  j                  |        |r t         j                  j                  d       yt               j	                         dk7  S )NzY
Tn)ru   rv   rw   inputr-   )
input_textauto_accepts     r   accept_suggestionr  6  s?    JJZ 

7==?c!!r   c                   d}d|j                    d}|j                   -t        ||      r!|j                  J |j                  |       } d}|j                  -t        j
                  j                  d|j                   d       | |fS )zc
    This function will apply an optional remediation to a dataframe, based on the user input.
    Fz- [Recommended] z [Y/n]: Tz- [Necessary] rt   )r   r  r   r   ru   rv   rw   )r%   r   r
  optional_appliedr	  s        r   apply_optional_remediationr  >  s     #K$<$<#=XFJ+Z5**666((,B#  ,

>+*C*C)DBGHr   c                    t        |       }d}|dk(  rt        |       }|dz  }n%| j                  d      j                         }|dz  }dd} ||dz         }t        j
                  j                  d	| d
       y)z?
    Estimate the time it'll take to fine-tune the dataset
    g      ?r   g
ףp=
?T)rZ   g|?5^?c                    | dk  rt        | d       dS | dk  rt        | dz  d       dS | dk  rt        | dz  d       dS t        | dz  d       dS )	N<   r
   z secondsi  z minutesiQ z hoursz days)round)times    r   format_timez.estimate_fine_tuning_time.<locals>.format_time]  sv    "9D!n%X..D[D2Iq)*(33E\D4K+,F33D5L!,-U33r      z:Once your model starts training, it'll approximately take z~ to train a `curie` model, and less for `ada` and `babbage`. Queue will approximately take half an hour per job ahead of you.
N)r  floatr9   r   )r{   r$   memory_usager   ru   rv   rw   )r%   	ft_formatexpected_timer"   sizer  time_strings          r   estimate_fine_tuning_timer  P  s      #IM$$2w$t+T*..0v4 mc12KJJ
D[M  RQ  	Rr   c                    |rddgndg}d}	 |dkD  rd| dnd}|D cg c],  }t         j                  j                  |       d    d| | d. }}t        d	 |D              s|S |d
z  }^c c}w )N_train_validr!   r   z ()	_preparedr   c              3  Z   K   | ]#  }t         j                  j                  |       % y wr4   )r   r   r   )r   r   s     r   r   z get_outfnames.<locals>.<genexpr>s  s     ?277>>!$?s   )+rn   )r   r   splitextrW   )r   r   suffixesiindex_suffixr   candidate_fnamess          r   get_outfnamesr(  m  s    ',(#2$H	A
$%EA3ayrowxekrww//6q9:)F8L>Y_`xx?.>??##	Q xs   1A+c                    | j                   j                         }d }|dk(  r'| j                   j                         j                  d   }||fS )Nr
   r   )r<   r   value_countsrZ   )r%   	n_classes	pos_classs      r   get_classification_hyperparamsr-  x  sH    %%'IIA~MM..066q9	ir   c                    t        |       }t        | j                  d      }t        | j                  d      }d}d}|dk(  rt	        ||      rd}d}	|j                  dd	      }
|j                  dd	      }t        |      d
kD  rd| dnd}d}|s:|s8t        j                  j                  d| d|	 d|
 d| d	       t        |        y*t	        ||      rt        ||      }|rt        |      dk(  rd|d
   v rd|d   v sJ d}t        t        |       |z
  t        t        |       dz              }| j                  |d      }| j                  |j                         }|ddg   j#                  |d
   ddd       |ddg   j#                  |d   ddd       t%        |       \  }}|	dz  }	|dk(  r
|	d| dz  }	n6|	d | z  }	n-t        |      dk(  sJ | ddg   j#                  |d
   ddd       |rd!ndd"z   d#j'                  |      z   }|r	d$|d    dnd}t        |
      d
k(  rdnd%|
 d}t        j                  j                  d&| d'|d
    d| |	 d(| | d       t        |        y*t        j                  j                  d)       y*)+aQ  
    This function will write out a dataframe to a file, if the user would like to proceed, and also offer a fine-tuning command with the newly created file.
    For classification it will optionally ask the user if they would like to split the data into train/valid files, and modify the suggested command to include the valid set.
    r   r   FzQ- [Recommended] Would you like to split into training and validation set? [Y/n]: r   Tr!   rt   r   r   z Make sure to include `stop=["z;"]` so that the generated texts ends at the expected place.z@

Your data will be written to a new JSONL file. Proceed [Y/n]: zK
You can use your file for fine-tuning:
> openai api fine_tunes.create -t ""ue   

After you’ve fine-tuned a model, remember that your prompt has to end with the indicator string `zX` for the model to start generating completions, rather than continuing with the prompt.r
   trainvalidrn   i  g?*   )r  random_stater;   r<   records)r   orientforce_asciiz! --compute_classification_metricsz" --classification_positive_class "z --classification_n_classes rT   z to `z` and `z -v "uc   After you’ve fine-tuned a model, remember that your prompt has to end with the indicator string `z
Wrote modified filezd`
Feel free to take a look!

Now use that file when fine-tuning:
> openai api fine_tunes.create -t "z

z#Aborting... did not write the file
N)r{   r   r;   r<   r  r   r$   ru   rv   rw   r  r(  maxintsamplerx   rZ   to_jsonr-  rg   )r%   r   any_remediationsr
  r  common_prompt_suffixr   r   r	  additional_params%common_prompt_suffix_new_line_handled)common_completion_suffix_new_line_handledoptional_ending_stringfnamesMAX_VALID_EXAMPLESn_traindf_traindf_validr+  r,  files_stringvalid_stringseparator_reminders                          r   write_out_filerI    s   
  #I*2998D.r}}8LEdJ$$Z5E,@,H,Hu,U)0H0P0PQUW\0]- 89A= ))R(S  TO  	P  VJE

[\a[bbcducv  w^  _D  ^E  E]  ^t  ]u  uw  x	
 	""%	:{	3ue,v;!#6!9(<FSTIAUUU!%#b'$66CGcM8JKGyy7y<Hwwx~~.Hh-.66q	iU 7  h-.66vayU^lq6r#A"#E Iy!DDA~!'I)TU%VV!!'CI;%OO!v;!###,'(00$yfk0l  %"79>>&;QR/4vayk+" 89Q> v  x]  w^  ^v  w 	
 	

#L>  2Z  [a  bc  [d  Ze  ef  gs  ft  uF  tG  GK  L^  K_  `v  _w  wy  z	
 	""%

?@r   c                    d}t        | j                  j                  j                               dk(  ryt        | j                  j                               t        |       |z  k  ryy)z>
    Infer the likely fine-tuning task type from the data
       r   rk   r   zconditional generation)r   r;   r   r$   r<   unique)r%   CLASSIFICATION_THRESHOLDs     r   r{   r{     sU      !
299==1$&
2==!"SW/G%GG#r   c                    d}	 |dk(  r| j                   t        |      dz    d n| j                   dt        |      dz    }|j                         dk7  r	 |S ||j                  d   k(  r	 |S |j                  d   }w)zQ
    Finds the longest common suffix or prefix of all the values in a series
    r!   r   rn   Nr   )r   r$   r   r   )seriesr   common_xfixcommon_xfixess       r   r   r     s     K
59X5EFJJ[)A-.016::VlX[\gXhklXlKm 	   "a'
 	 M0033  (..q1K r   z,Callable[[pd.DataFrame], Remediation | None]r	   	Validatorc                     t         d d t        t        t        t        t
        d d t        t        t        t        t        gS )Nc                    t        | d      S r   r:   rL   s    r   rM   z get_validators.<locals>.<lambda>  s    ,Q9 r   c                    t        | d      S r   rU  rL   s    r   rM   z get_validators.<locals>.<lambda>  s    ,Q= r   c                    t        | d      S r   r   rL   s    r   rM   z get_validators.<locals>.<lambda>  s    &q(3 r   c                    t        | d      S r   rX  rL   s    r   rM   z get_validators.<locals>.<lambda>  s    &q,7 r   )r(   rI   r^   r  ri   r}   r   r   r   r   r   r   r   r   get_validatorsrZ    s9    9=#!!!37&&**) r   c                0   g }||j                  |       |D ]*  } ||       }||j                  |       t        | |      } , t        |D cg c]  }|j                  |j                  | c}      }t        |D cg c]  }|j                  | c}      }	d}
|r=t
        j                  j                  d       |D ]  }t        | ||      \  } }|
xs |}
 nt
        j                  j                  d       |
xs |	} || |||       y c c}w c c}w )NFz?

Based on the analysis we will perform the following actions:
z

No remediations found.
)	appendr  rW   r   r   ru   rv   rw   r  )r%   r   r   
validatorsr
  write_out_file_funcoptional_remediations	validator&any_optional_or_necessary_remediationsany_necessary_appliedany_optional_appliedr  !any_optional_or_necessary_applieds                r   apply_validatorsre    sB    02$$[1 >	m"!((5,R=B	> .1  5	
''3{7P7P7\ 	
.*  (=gAZAZAfg !-

]^0 	LK#=b+{#[ B #7#K;K 	L 	

78(<(U@U%E#DkR+	
 	hs   !D?DD)r%   r8   r9   r   )r%   r8   r5   r   r9   r   )r%   r8   rA   	list[str]r9   r   )r<   )r%   r8   rR   r   r9   r   )r%   r8   r/   r   r9   Remediation | None)r   r   rA   rf  r9   z'tuple[pd.DataFrame | None, Remediation])r%   r   r   r   r9   r   )r	  r   r
  boolr9   rh  )r%   r8   r   r   r
  rh  r9   ztuple[pd.DataFrame, bool])r%   r8   r9   None)r   r   r   rh  r9   rf  )r%   r8   r9   ztuple[int, object])
r%   r8   r   r   r;  rh  r
  rh  r9   ri  )r%   r8   r9   r   )r   )rO  r   r   r   r9   r   )r9   list[Validator])r%   r8   r   r   r   rg  r]  rj  r
  rh  r^  zCallable[..., Any]r9   ri  ),
__future__r   r   ru   typingr   r   r   r   r   typing_extensionsr	   _extrasr   r   r   r   r(   r:   rI   r^   ri   r}   r   r   r   r   r   r   r   r  r  r  r  r  r(  r-  rI  r{   r   rR  r   rZ  re  r   r   r   <module>ro     sx   " 	 
 ? ? ' "$* $ 19QR I!H HPQ]F^ <4 FN|D\ 2"JAH$N!HAH2. &.|$<VV!V,Vr	I"  #. =A  $: DAN$$ F	9 E('S'S'S $'S  	'S
 'S ,'S 
'Sr   