fGddlZddlZddlZddlZddlmZddlmZmZm Z ddl m Z dZ ddZddZdd ZGd d ZGd d ZdZddZddZdZeddZddZy)N)contextmanager)AnyDictList)languagecdj|}dddd|zdg}tj|}|jtj j jd}|Dcgc] }t|}}|Scc}w)N, nvidia-smi-i0z --query-gpu=z--format=csv,noheader,nounits) join subprocess check_outputdecodesysstdoutencodingsplitint)attrscmdoutretxs W/var/lib/jenkins/workspace/mettalog/venv/lib/python3.12/site-packages/triton/testing.pynvsmir sy HHUOE sNU$:<[ \C  ! !# &C **SZZ(( ) / / 4C a3q6 C  J s-Bc ddl}|dvsJ|jj|jjk(r t d||/|D]*}|j |j dd|_,|jj}|jj|5|ddd|jj|jjd}|jjd}|j|j|j|jj|j|} tdt!|| z } |jj}|jj|5t#| D]} ||D] }d|_ | ddd|jjg} d} t#| D]} |jjd}|jjd}|j|j|j|jj| |j|| z gz } |j%| }t'|||j)S#1swY.xYw#1swYxYw) a+ Benchmark the runtime of the provided function. :param fn: Function to benchmark :type fn: Callable :param rep: Repetition time (in ms) :type rep: int :param grad_to_none: Reset the gradient of the provided tensor to None :type grad_to_none: torch.tensor, optional rNminmaxmeanmedianzQCannot capture graph in default stream. Please use side stream in benchmark code.T enable_timingr )torchcudacurrent_streamdefault_stream RuntimeErrordetach_requires_grad_grad CUDAGraphgraph synchronizeEventrecordreplay elapsed_timer!rrangetensorgetattritem)fnrep grad_to_none return_moder'rg start_event end_event estimate_msn_repeatir n_retriestimess rdo_bench_cudagraphrFs : :: : zz  "ejj&?&?&AAnooDA IIK  T "AF  A   !     JJ**"""6K   t 4IHHJ  JJ**95K1c# +,-H  A   ! xA'%A!AF& D !  JJ CI 9 jj&&T&: JJ$$4$8      ((3h>?? LL E &75+ &u - 2 2 44C  s8K,(K9,K69Lc .|dvsJddl}||jj|r(|jt d|jd}n'|jt d|j d}|jj d } |jj d } | jtd D]} |j|| j|jj| j| d z } td t || z } td t || z }t|Dcgc]}|jj d  } }t|Dcgc]}|jj d  } }t| D] } | t|D]O}||D] }d|_ |j| |j|| |jQ|jj|jt| | Dcgc]\}}|j|c}}|j }|P|j!||j||j j#}t%|d k(r|d}|St'|||j)Scc}wcc}wcc}}w) a Benchmark the runtime of the provided function. By default, return the median runtime of :code:`fn` along with the 20-th and 80-th performance percentile. :param fn: Function to benchmark :type fn: Callable :param warmup: Warmup time (in ms) :type warmup: int :param rep: Repetition time (in ms) :type rep: int :param grad_to_none: Reset the gradient of the provided tensor to None :type grad_to_none: torch.tensor, optional :param quantiles: Performance percentile to return in addition to the median. :type quantiles: list[float] :param fast_flush: Use faster kernel to flush L2 between measurements :type fast_flush: bool rrNgAr()dtypedevicegATr$r)rH)r'r(r1emptyrint8r2r3r6zero_r5r!r.r7zipfloatquantiletolistlenr8r9)r:warmupr;r< quantiles fast_flushr=r'cacher?r@_rAn_warmuprBrCrserErs rdo_benchr[Rs$ : :: :D JJ  C O599V L CJejj H**"""6K   t 4I 1X   JJ**959K1c&;./0H1c# +,-HAFxQA5::##$#7QKQ?DXO!!!!5OIO 8_ 8_  #!"  A !  JJ LLK8ST1!..+T\a\g\gL hEnnUELL%++L$NOVVX s8q=a&C &75+ &u - 2 2 447RO(Us'#L#L &L c ddl}ddl}t||js|j |}t||js|j |}|d}t |r||j n|}|d}t |r||j n|}t||jrU|j |jk(r|j}|jjj}t||jrU|j |jk(r|j}|jjj}|jdkDs|jdkDr!|jj||||dy|j||||st|d|d |d |d |d y) Nrg{Gz?grT)atolrtol equal_nan)r]r^ z is not close to z (atol=z, rtol=))numpyr' isinstanceTensorr7callablerHbfloat16rOcpudetachsizetestingassert_allcloseallcloseAssertionError)ryr]r^err_msgnpr's r assert_closerqs a & LLO a & LLO |$TN4=D |$TN4=D!U\\" 77enn $ A EEGNN  " " $!U\\" 77enn $ A EEGNN  " " $ vvzQVVaZ ""1ad"N ;;q!$T; 2y!,=aSvWUYTZZ[\]] 3cteZdZdZ ddeedeededeedeeded eeefd ed ed ed efdZ y) Benchmarkzk This class is used by the :code:`perf_report` function to generate line plots with a concise API. Nx_namesx_valsline_arg line_vals line_names plot_nameargsxlabelylabelx_logy_logc||_||_| |_||_||_||_| |_| |_||_| |_ ||_ ||_ y)a Constructor. x_vals can be a list of scalars or a list of tuples/lists. If x_vals is a list of scalars and there are multiple x_names, all arguments will have the same value. If x_vals is a list of tuples/lists, each element should have the same length as x_names. :param x_names: Name of the arguments that should appear on the x axis of the plot. :type x_names: List[str] :param x_vals: List of values to use for the arguments in :code:`x_names`. :type x_vals: List[Any] :param line_arg: Argument name for which different values correspond to different lines in the plot. :type line_arg: str :param line_vals: List of values to use for the arguments in :code:`line_arg`. :type line_vals: List[Any] :param line_names: Label names for the different lines. :type line_names: List[str] :param plot_name: Name of the plot. :type plot_name: str :param args: Dictionary of keyword arguments to remain fixed throughout the benchmark. :type args: Dict[str, Any] :param xlabel: Label for the x axis of the plot. :type xlabel: str, optional :param ylabel: Label for the y axis of the plot. :type ylabel: str, optional :param x_log: Whether the x axis should be log scale. :type x_log: bool, optional :param y_log: Whether the y axis should be log scale. :type y_log: bool, optional N) rurvr~rwrxryrstylesr|r}rzr{)selfrurvrwrxryrzr{r|r}r~rcolorrs r__init__zBenchmark.__init__sY\     "$    " rr)rFFNN) __name__ __module__ __qualname____doc__rstrrrboolrrrrrtrts:c:S : : 9 : I ::38n:::::rrrtc 8eZdZdZ d dedededefdZd dZy) Markc ||_||_yNr: benchmarks)rr:rs rrz Mark.__init__s$rrbench save_path show_plots print_datac  ddl}ddlm} ddl} |j} |jD cgc]} | d } } |jD cgc]} | d }} t |j }| j|| z| z|z}|jD]} t| t tfs|Dcgc]}| } }t| t|k7rtdt|d| tt|| }ggg}}}|jD]I}|j di||j"|i|j$|} |\} } }|| gz }|| gz }||gz }Kt | |z|z|z|j(t|<|j*r| j-| j/}|d}t1|jD]\}}||dz||dz}} |j2r|j2|dnd}|j2r|j2|dnd}|j5|||||||| j7j9r|j7j9r| j;t<} |j;t<}|j?||| |d | |jA|jC|jDxs||jG|jH|jK|jLrd nd |jO|jPrd nd |r| jS|r8| jU|jVjY||j*d |||jz}|r=|jZddk(r+|j\j_\}}||||z |d<|r1ta|j*dzta|jc|r?|je|jVjY||j*dd|dd|Scc} wcc} wcc}w#t&$r |dd}} } Y8wxYw)Nrz-minz-max)columnsz Expected z values, got r)labelrlsg333333?)alpharloglinearz.pngDiff:z.csvz%.fF) float_formatindexr)3osmatplotlib.pyplotpyplotpandasrylistru DataFramervrctuplerR ValueErrordictrNrxr:rwr{ TypeErrorlocrzfiguresubplot enumeraterplotisnullallastyperO fill_betweenlegend set_xlabelr| set_ylabelr} set_xscaler~ set_yscalershowsavefigpathrshaperrQprint to_stringto_csv)rrrrrdiff_colsave_precisionkwragsrpltpdy_meanry_miny_maxrudfrWx_argsrow_meanrow_minrow_maxrnraxfirst_xrCcolstycol0col1s r_runz Mark._run s'!!%*%5%56A3d66%*%5%56A3d66u}}% \\'F"2U":U"B\ CAa$/ '(1Q((1vW% 9S\N-s!KLL#gq/*F)+RwgH__dggVV5>>1*=VVvV;+.(FE5VH$E7"E7"%#1g07:WDBFF3r7O'* ?? JJLBajG!%"2"231!!f*~r!f*~u,1LLell1oa(d,1LLell1oa(d7 RU!33G||~))+ELLN4F4F4H!LL/E!LL/EOOBwKTQTOU4 IIK MM%,,1' 2 MM%,, ' MM5;;%H = MM5;;%H =  BGGLLu6Gt4LMN %*** +  q(**,JD$DBtH,BvJ  %//C' ( ",,. !  IIbggll90A.FGXZ[iZjjkVl!  # y76 )!;+.d5EF;s# Q Q#, Q(Q--R?Rc t|jt}|r |jgn |j}g}|rRtj|dt tj j|dd} | jd|D]I} |j|j| |||fi||s+ jd| jdK|r! jd| j|r |r|d S|Sy) NT)exist_okz results.htmlwz z z r) rcrrtrmakedirsopenrrwriteappendrrzclose) rrrr return_dfkwargshas_single_benchr result_dfshtmlrs rrunzMark.runPs%dooyA*:doo&   KK D 1 Y?ED JJ' (E   idiiy*j[TZ[ \ ]5??*;:FG   JJ) * JJL !!}$!!rrN)F)FFrF) rrrrrtrrrrrrrrrrs>%chC)CCCSWCJrrrcfd}|S)z Mark a function for benchmarking. The benchmark can then be executed by using the :code:`.run` method on the return value. :param benchmarks: Benchmarking configurations. :type benchmarks: List of :class:`Benchmark` ct|Sr)rrs rzperf_report..os b*-rrr)rwrappers` r perf_reportrhs.G Nrrcddl}ddlm}|s|jj }|j j j|d}|j j j|d}||zdzdz d z }|S) z return DRAM bandwidth in GB/s rNrdrivermem_clock_rate mem_bus_widthrg.A)r'runtimerr(current_deviceactiveutilsget_device_properties)rIr'r mem_clock_khz bus_widthbw_gbpss r get_dram_gbpsrssz **,MM''==fEFVWM ##99&A/RIi'!+c1A5G NrrcJddl}ddlm}|s|jj }|j j j|ddz}|jj|}|ddkr||jk(sJd}n||j|jfvrd}nr||j|j|jfvrd}nJ||jtj tj"tj$fvrd }n t'd ||z|zd z}|S) Nrrrmultiprocessor_countriidtype not supported& .>)r'rrr(rrrrget_device_capabilityfloat16float32int32rfint16rLtl float8e4nv float8e4b15float8e5r+ rH clock_raterIr'r num_subcores capabilityops_per_sub_coretflopss rget_max_tensorcore_tflopsrs **,==&&<.decorator..wrappers! rzz|499;I - 3 3 5 G  Y/%Aww''(;(;J(GH!zz&1UXY F*n,nn* +0099<<b!1!1 2!G9A> nnox%L]agjk~~*e,ee*0C OCCC((rr) functoolswraps)r-rr,s` r decoratorz cuda_memcheck..decorators%  ! ) " )"rrr)r,r0s` r cuda_memcheckr1s, rrc #K tjgdtjdddd|d|gtjdddd|d|gtdgd }td gd }t||z d ks Jd |d t||z d ks Jd |d d|z}d|zdz}||ftjgdtjgdtjgdy#tjgdtjgdtjgdwxYww)N)r r r -pmrr r r z--lock-gpu-clocks=r z--lock-memory-clocks=zclocks.current.smrzclocks.current.memoryr&zGPU SMs must run at z MHzg3O?igMbP?)r r r r3r )r r r z-rgc)r r r z-rmc)rrrabs) ref_sm_clock ref_mem_clock cur_sm_clock cur_mem_clockrgbpss r set_gpu_clockr:smC EF    a ~ > !     #M?!M? C !  123A6 678; <,./"4_8L\NZ^6__4==01B6b:N}o]a8bb6)L8&-dl EF AB AB  EF AB ABsEB>DAEAEEcddl}ddlm}|s|jj }|j j j|ddz}|jj}|ddkr/||jk(rd}nW||jk(rd}nEtd ||jk(rd}n(||j|jfvrd}n td ||z|zd z}|S) Nrrrrrr @rr) r'rrr(rrrrrrrr+rfr s rget_max_simd_tflopsr>s **,==&&<rrrrrDs}  %""<5~I5X"^J??D``F :6CC8rr