Actual source code: xmllogevent.c
1: /*************************************************************************************
2: * M A R I T I M E R E S E A R C H I N S T I T U T E N E T H E R L A N D S *
3: *************************************************************************************
4: * authors: Bas van 't Hof, Koos Huijssen, Christiaan M. Klaij *
5: *************************************************************************************
6: * content: Support for nested PetscTimers *
7: *************************************************************************************/
8: #include <petsclog.h>
9: #include <petsc/private/logimpl.h>
10: #include <petsctime.h>
11: #include <petscviewer.h>
12: #include "../src/sys/logging/xmlviewer.h"
14: #if defined(PETSC_USE_LOG)
16: /*
17: * Support for nested PetscTimers
18: *
19: * PetscTimers keep track of a lot of useful information: Wall clock times,
20: * message passing statistics, flop counts. Information about the nested structure
21: * of the timers is lost. Example:
22: *
23: * 7:30 Start: awake
24: * 7:30 Start: morning routine
25: * 7:40 Start: eat
26: * 7:49 Done: eat
27: * 7:43 Done: morning routine
28: * 8:15 Start: work
29: * 12:15 Start: eat
30: * 12:45 Done: eat
31: * 16:00 Done: work
32: * 16:30 Start: evening routine
33: * 18:30 Start: eat
34: * 19:15 Done: eat
35: * 22:00 Done: evening routine
36: * 22:00 Done: awake
37: *
38: * Petsc timers provide the following timer results:
39: *
40: * awake: 1 call 14:30 hours
41: * morning routine: 1 call 0:13 hours
42: * eat: 3 calls 1:24 hours
43: * work: 1 call 7:45 hours
44: * evening routine 1 call 5:30 hours
45: *
46: * Nested timers can be used to get the following table:
47: *
48: * [1 call]: awake 14:30 hours
49: * [1 call]: morning routine 0:13 hours ( 2 % of awake)
50: * [1 call]: eat 0:09 hours (69 % of morning routine)
51: * rest (morning routine) 0:04 hours (31 % of morning routine)
52: * [1 call]: work 7:45 hours (53 % of awake)
53: * [1 call]: eat 0:30 hours ( 6 % of work)
54: * rest (work) 7:15 hours (94 % of work)
55: * [1 call]: evening routine 5:30 hours (38 % of awake)
56: * [1 call]: eat 0:45 hours (14 % of evening routine)
57: * rest (evening routine) 4:45 hours (86 % of morning routine)
58: *
59: * We ignore the concept of 'stages', because these seem to be conflicting notions, or at least,
60: * the nested timers make the stages unnecessary.
61: *
62: */
64: /*
65: * Data structures for keeping track of nested timers:
66: *
67: * nestedEvents: information about the timers that have actually been activated
68: * dftParentActive: if a timer is started now, it is part of (nested inside) the dftParentActive
69: *
70: * The Default-timers are used to time the nested timers. Every nested timer corresponds to
71: * (one or more) default timers, where one of the default timers has the same event-id as the
72: * nested one.
73: *
74: * Because of the risk of confusion between nested timer ids and default timer ids, we
75: * introduce a typedef for nested events (NestedEventId) and use the existing type PetscLogEvent
76: * only for default events. Also, all nested event variables are prepended with 'nst', and
77: * default timers with 'dft'.
78: */
80: #define DFT_ID_AWAKE -1
82: typedef PetscLogEvent NestedEventId;
83: typedef struct {
84: NestedEventId nstEvent; /* event-code for this nested event, argument 'event' in PetscLogEventStartNested */
85: int nParents; /* number of 'dftParents': the default timer which was the dftParentActive when this nested timer was activated */
86: PetscLogEvent *dftParentsSorted; /* The default timers which were the dftParentActive when this nested event was started */
87: PetscLogEvent *dftEvents; /* The default timers which represent the different 'instances' of this nested event */
89: PetscLogEvent *dftParents; /* The default timers which were the dftParentActive when this nested event was started */
90: PetscLogEvent *dftEventsSorted; /* The default timers which represent the different 'instances' of this nested event */
91: } PetscNestedEvent;
93: static PetscLogEvent dftParentActive = DFT_ID_AWAKE;
94: static int nNestedEvents = 0;
95: static int nNestedEventsAllocated = 0;
96: static PetscNestedEvent *nestedEvents = NULL;
97: static PetscLogDouble thresholdTime = 0.01; /* initial value was 0.1 */
99: #define THRESHOLD (thresholdTime/100.0+1e-12)
101: static PetscErrorCode PetscLogEventBeginNested(NestedEventId nstEvent, int t, PetscObject o1, PetscObject o2, PetscObject o3, PetscObject o4);
102: static PetscErrorCode PetscLogEventEndNested(NestedEventId nstEvent, int t, PetscObject o1, PetscObject o2, PetscObject o3, PetscObject o4);
103: PETSC_INTERN PetscErrorCode PetscLogView_Nested(PetscViewer);
104: PETSC_INTERN PetscErrorCode PetscLogView_Flamegraph(PetscViewer);
106: /*@C
107: PetscLogNestedBegin - Turns on nested logging of objects and events. This logs flop
108: rates and object creation and should not slow programs down too much.
110: Logically Collective over PETSC_COMM_WORLD
112: Options Database Keys:
113: . -log_view :filename.xml:ascii_xml - Prints an XML summary of flop and timing information to the file
115: Usage:
116: .vb
117: PetscInitialize(...);
118: PetscLogNestedBegin();
119: ... code ...
120: PetscLogView(viewer);
121: PetscFinalize();
122: .ve
124: Level: advanced
126: .seealso: PetscLogDump(), PetscLogAllBegin(), PetscLogView(), PetscLogTraceBegin(), PetscLogDefaultBegin()
127: @*/
128: PetscErrorCode PetscLogNestedBegin(void)
129: {
132: nNestedEventsAllocated = 10;
133: PetscMalloc1(nNestedEventsAllocated,&nestedEvents);
134: dftParentActive = DFT_ID_AWAKE;
135: nNestedEvents =1;
137: /* 'Awake' is nested event 0. It has no parents */
138: nestedEvents[0].nstEvent = 0;
139: nestedEvents[0].nParents = 0;
140: nestedEvents[0].dftParentsSorted = NULL;
141: nestedEvents[0].dftEvents = NULL;
142: nestedEvents[0].dftParents = NULL;
143: nestedEvents[0].dftEventsSorted = NULL;
145: PetscLogSet(PetscLogEventBeginNested,PetscLogEventEndNested);
146: return 0;
147: }
149: /* Delete the data structures for the nested timers */
150: PetscErrorCode PetscLogNestedEnd(void)
151: {
152: int i;
154: if (!nestedEvents) return 0;
155: for (i=0; i<nNestedEvents; i++) {
156: PetscFree4(nestedEvents[i].dftParentsSorted,nestedEvents[i].dftEventsSorted,nestedEvents[i].dftParents,nestedEvents[i].dftEvents);
157: }
158: PetscFree(nestedEvents);
159: nestedEvents = NULL;
160: nNestedEvents = 0;
161: nNestedEventsAllocated = 0;
162: return 0;
163: }
165: /*
166: UTILITIES: FIND STUFF IN SORTED ARRAYS
168: dftIndex - index to be found
169: dftArray - sorted array of PetscLogEvent-ids
170: narray - dimension of dftArray
171: entry - entry in the array where dftIndex may be found;
173: if dftArray[entry] != dftIndex, then dftIndex is not part of dftArray
174: In that case, the dftIndex can be inserted at this entry.
175: */
176: static PetscErrorCode PetscLogEventFindDefaultTimer(PetscLogEvent dftIndex,const PetscLogEvent *dftArray,int narray,int *entry)
177: {
178: if (narray==0 || dftIndex <= dftArray[0]) {
179: *entry = 0;
180: } else if (dftIndex > dftArray[narray-1]) {
181: *entry = narray;
182: } else {
183: int ihigh = narray-1, ilow=0;
184: while (ihigh>ilow) {
185: const int imiddle = (ihigh+ilow)/2;
186: if (dftArray[imiddle] > dftIndex) {
187: ihigh = imiddle;
188: } else if (dftArray[imiddle]<dftIndex) {
189: ilow = imiddle+1;
190: } else {
191: ihigh = imiddle;
192: ilow = imiddle;
193: }
194: }
195: *entry = ihigh;
196: }
197: return 0;
198: }
200: /*
201: Utility: find the nested event with given identification
203: nstEvent - Nested event to be found
204: entry - entry in the nestedEvents where nstEvent may be found;
206: if nestedEvents[entry].nstEvent != nstEvent, then index is not part of iarray
207: */
208: static PetscErrorCode PetscLogEventFindNestedTimer(NestedEventId nstEvent,int *entry)
209: {
210: if (nNestedEvents==0 || nstEvent <= nestedEvents[0].nstEvent) {
211: *entry = 0;
212: } else if (nstEvent > nestedEvents[nNestedEvents-1].nstEvent) {
213: *entry = nNestedEvents;
214: } else {
215: int ihigh = nNestedEvents-1, ilow = 0;
216: while (ihigh>ilow) {
217: const int imiddle = (ihigh+ilow)/2;
218: if (nestedEvents[imiddle].nstEvent > nstEvent) {
219: ihigh = imiddle;
220: } else if (nestedEvents[imiddle].nstEvent<nstEvent) {
221: ilow = imiddle+1;
222: } else {
223: ihigh = imiddle;
224: ilow = imiddle;
225: }
226: }
227: *entry = ihigh;
228: }
229: return 0;
230: }
232: /*
233: Nested logging is not prepared yet to support user-defined logging stages, so for now we force logging on the main stage.
234: Using PetscLogStage{Push/Pop}() would be more appropriate, but these two calls do extra bookkeeping work we don't need.
235: */
237: #define MAINSTAGE 0
239: static PetscLogStage savedStage = 0;
241: static inline PetscErrorCode PetscLogStageOverride(void)
242: {
243: PetscStageLog stageLog = petsc_stageLog;
245: if (stageLog->curStage == MAINSTAGE) return 0;
246: savedStage = stageLog->curStage;
247: stageLog->curStage = MAINSTAGE;
248: PetscIntStackPush(stageLog->stack, MAINSTAGE);
249: return 0;
250: }
252: static inline PetscErrorCode PetscLogStageRestore(void)
253: {
254: PetscStageLog stageLog = petsc_stageLog;
256: if (savedStage == MAINSTAGE) return 0;
257: stageLog->curStage = savedStage;
258: PetscIntStackPop(stageLog->stack, &savedStage);
259: return 0;
260: }
262: /******************************************************************************************/
263: /* Start a nested event */
264: static PetscErrorCode PetscLogEventBeginNested(NestedEventId nstEvent, int t, PetscObject o1, PetscObject o2, PetscObject o3, PetscObject o4)
265: {
266: int entry, pentry, tentry,i;
267: PetscLogEvent dftEvent;
269: PetscLogEventFindNestedTimer(nstEvent, &entry);
270: if (entry>=nNestedEvents || nestedEvents[entry].nstEvent != nstEvent) {
271: /* Nested event doesn't exist yet: create it */
273: if (nNestedEvents==nNestedEventsAllocated) {
274: /* Enlarge and re-allocate nestedEvents if needed */
275: PetscNestedEvent *tmp = nestedEvents;
276: PetscMalloc1(2*nNestedEvents,&nestedEvents);
277: nNestedEventsAllocated*=2;
278: PetscArraycpy(nestedEvents, tmp, nNestedEvents);
279: PetscFree(tmp);
280: }
282: /* Clear space in nestedEvents for new nested event */
283: nNestedEvents++;
284: for (i = nNestedEvents-1; i>entry; i--) {
285: nestedEvents[i] = nestedEvents[i-1];
286: }
288: /* Create event in nestedEvents */
289: nestedEvents[entry].nstEvent = nstEvent;
290: nestedEvents[entry].nParents=1;
291: PetscMalloc4(1,&nestedEvents[entry].dftParentsSorted,1,&nestedEvents[entry].dftEventsSorted,1,&nestedEvents[entry].dftParents,1,&nestedEvents[entry].dftEvents);
293: /* Fill in new event */
294: pentry = 0;
295: dftEvent = (PetscLogEvent) nstEvent;
297: nestedEvents[entry].nstEvent = nstEvent;
298: nestedEvents[entry].dftParents[pentry] = dftParentActive;
299: nestedEvents[entry].dftEvents[pentry] = dftEvent;
300: nestedEvents[entry].dftParentsSorted[pentry] = dftParentActive;
301: nestedEvents[entry].dftEventsSorted[pentry] = dftEvent;
303: } else {
304: /* Nested event exists: find current dftParentActive among parents */
305: PetscLogEvent *dftParentsSorted = nestedEvents[entry].dftParentsSorted;
306: PetscLogEvent *dftEvents = nestedEvents[entry].dftEvents;
307: int nParents = nestedEvents[entry].nParents;
309: PetscLogEventFindDefaultTimer( dftParentActive, dftParentsSorted, nParents, &pentry);
311: if (pentry>=nParents || dftParentActive != dftParentsSorted[pentry]) {
312: /* dftParentActive not in the list: add it to the list */
313: int i;
314: PetscLogEvent *dftParents = nestedEvents[entry].dftParents;
315: PetscLogEvent *dftEventsSorted = nestedEvents[entry].dftEventsSorted;
316: char name[100];
318: /* Register a new default timer */
319: sprintf(name, "%d -> %d", (int) dftParentActive, (int) nstEvent);
320: PetscLogEventRegister(name, 0, &dftEvent);
321: PetscLogEventFindDefaultTimer( dftEvent, dftEventsSorted, nParents, &tentry);
323: /* Reallocate parents and dftEvents to make space for new parent */
324: PetscMalloc4(1+nParents,&nestedEvents[entry].dftParentsSorted,1+nParents,&nestedEvents[entry].dftEventsSorted,1+nParents,&nestedEvents[entry].dftParents,1+nParents,&nestedEvents[entry].dftEvents);
325: PetscArraycpy(nestedEvents[entry].dftParentsSorted, dftParentsSorted, nParents);
326: PetscArraycpy(nestedEvents[entry].dftEventsSorted, dftEventsSorted, nParents);
327: PetscArraycpy(nestedEvents[entry].dftParents, dftParents, nParents);
328: PetscArraycpy(nestedEvents[entry].dftEvents, dftEvents, nParents);
329: PetscFree4(dftParentsSorted,dftEventsSorted,dftParents,dftEvents);
331: dftParents = nestedEvents[entry].dftParents;
332: dftEvents = nestedEvents[entry].dftEvents;
333: dftParentsSorted = nestedEvents[entry].dftParentsSorted;
334: dftEventsSorted = nestedEvents[entry].dftEventsSorted;
336: nestedEvents[entry].nParents++;
337: nParents++;
339: for (i = nParents-1; i>pentry; i--) {
340: dftParentsSorted[i] = dftParentsSorted[i-1];
341: dftEvents[i] = dftEvents[i-1];
342: }
343: for (i = nParents-1; i>tentry; i--) {
344: dftParents[i] = dftParents[i-1];
345: dftEventsSorted[i] = dftEventsSorted[i-1];
346: }
348: /* Fill in the new default timer */
349: dftParentsSorted[pentry] = dftParentActive;
350: dftEvents[pentry] = dftEvent;
351: dftParents[tentry] = dftParentActive;
352: dftEventsSorted[tentry] = dftEvent;
354: } else {
355: /* dftParentActive was found: find the corresponding default 'dftEvent'-timer */
356: dftEvent = nestedEvents[entry].dftEvents[pentry];
357: }
358: }
360: /* Start the default 'dftEvent'-timer and update the dftParentActive */
361: PetscLogStageOverride();
362: PetscLogEventBeginDefault(dftEvent,t,o1,o2,o3,o4);
363: PetscLogStageRestore();
364: dftParentActive = dftEvent;
365: return 0;
366: }
368: /* End a nested event */
369: static PetscErrorCode PetscLogEventEndNested(NestedEventId nstEvent, int t, PetscObject o1, PetscObject o2, PetscObject o3, PetscObject o4)
370: {
371: int entry, pentry, nParents;
372: PetscLogEvent *dftEventsSorted;
374: /* Find the nested event */
375: PetscLogEventFindNestedTimer(nstEvent, &entry);
378: dftEventsSorted = nestedEvents[entry].dftEventsSorted;
379: nParents = nestedEvents[entry].nParents;
381: /* Find the current default timer among the 'dftEvents' of this event */
382: PetscLogEventFindDefaultTimer( dftParentActive, dftEventsSorted, nParents, &pentry);
387: /* Stop the default timer and update the dftParentActive */
388: PetscLogStageOverride();
389: PetscLogEventEndDefault(dftParentActive,t,o1,o2,o3,o4);
390: PetscLogStageRestore();
391: dftParentActive = nestedEvents[entry].dftParents[pentry];
392: return 0;
393: }
395: /*@
396: PetscLogSetThreshold - Set the threshold time for logging the events; this is a percentage out of 100, so 1. means any event
397: that takes 1 or more percent of the time.
399: Logically Collective over PETSC_COMM_WORLD
401: Input Parameter:
402: . newThresh - the threshold to use
404: Output Parameter:
405: . oldThresh - the previously set threshold value
407: Options Database Keys:
408: . -log_view :filename.xml:ascii_xml - Prints an XML summary of flop and timing information to the file
410: Usage:
411: .vb
412: PetscInitialize(...);
413: PetscLogNestedBegin();
414: PetscLogSetThreshold(0.1,&oldthresh);
415: ... code ...
416: PetscLogView(viewer);
417: PetscFinalize();
418: .ve
420: Level: advanced
422: .seealso: PetscLogDump(), PetscLogAllBegin(), PetscLogView(), PetscLogTraceBegin(), PetscLogDefaultBegin(),
423: PetscLogNestedBegin()
424: @*/
425: PetscErrorCode PetscLogSetThreshold(PetscLogDouble newThresh, PetscLogDouble *oldThresh)
426: {
427: if (oldThresh) *oldThresh = thresholdTime;
428: if (newThresh == PETSC_DECIDE) newThresh = 0.01;
429: if (newThresh == PETSC_DEFAULT) newThresh = 0.01;
430: thresholdTime = PetscMax(newThresh, 0.0);
431: return 0;
432: }
434: static PetscErrorCode PetscPrintExeSpecs(PetscViewer viewer)
435: {
436: char arch[128],hostname[128],username[128],pname[PETSC_MAX_PATH_LEN],date[128];
437: char version[256], buildoptions[128] = "";
438: PetscMPIInt size;
439: size_t len;
441: MPI_Comm_size(PetscObjectComm((PetscObject)viewer),&size);
442: PetscGetArchType(arch,sizeof(arch));
443: PetscGetHostName(hostname,sizeof(hostname));
444: PetscGetUserName(username,sizeof(username));
445: PetscGetProgramName(pname,sizeof(pname));
446: PetscGetDate(date,sizeof(date));
447: PetscGetVersion(version,sizeof(version));
449: PetscViewerXMLStartSection(viewer, "runspecification", "Run Specification");
450: PetscViewerXMLPutString( viewer, "executable" , "Executable" , pname);
451: PetscViewerXMLPutString( viewer, "architecture", "Architecture" , arch);
452: PetscViewerXMLPutString( viewer, "hostname" , "Host" , hostname);
453: PetscViewerXMLPutInt( viewer, "nprocesses" , "Number of processes", size);
454: PetscViewerXMLPutString( viewer, "user" , "Run by user" , username);
455: PetscViewerXMLPutString( viewer, "date" , "Started at" , date);
456: PetscViewerXMLPutString( viewer, "petscrelease", "Petsc Release", version);
458: if (PetscDefined(USE_DEBUG)) {
459: PetscStrlcat(buildoptions, "Debug ", sizeof(buildoptions));
460: }
461: if (PetscDefined(USE_COMPLEX)) {
462: PetscStrlcat(buildoptions, "Complex ", sizeof(buildoptions));
463: }
464: if (PetscDefined(USE_REAL_SINGLE)) {
465: PetscStrlcat(buildoptions, "Single ", sizeof(buildoptions));
466: } else if (PetscDefined(USE_REAL___FLOAT128)) {
467: PetscStrlcat(buildoptions, "Quadruple ", sizeof(buildoptions));
468: } else if (PetscDefined(USE_REAL___FP16)) {
469: PetscStrlcat(buildoptions, "Half ", sizeof(buildoptions));
470: }
471: if (PetscDefined(USE_64BIT_INDICES)) {
472: PetscStrlcat(buildoptions, "Int64 ", sizeof(buildoptions));
473: }
474: #if defined(__cplusplus)
475: PetscStrlcat(buildoptions, "C++ ", sizeof(buildoptions));
476: #endif
477: PetscStrlen(buildoptions,&len);
478: if (len) {
479: PetscViewerXMLPutString(viewer, "petscbuildoptions", "Petsc build options", buildoptions);
480: }
481: PetscViewerXMLEndSection(viewer, "runspecification");
482: return 0;
483: }
485: /* Print the global performance: max, max/min, average and total of
486: * time, objects, flops, flops/sec, memory, MPI messages, MPI message lengths, MPI reductions.
487: */
488: static PetscErrorCode PetscPrintXMLGlobalPerformanceElement(PetscViewer viewer, const char *name, const char *desc, PetscLogDouble local_val, const PetscBool print_average, const PetscBool print_total)
489: {
490: PetscLogDouble min, tot, ratio, avg;
491: MPI_Comm comm;
492: PetscMPIInt rank, size;
493: PetscLogDouble valrank[2], max[2];
495: PetscObjectGetComm((PetscObject)viewer,&comm);
496: MPI_Comm_size(PetscObjectComm((PetscObject)viewer),&size);
497: MPI_Comm_rank(comm, &rank);
499: valrank[0] = local_val;
500: valrank[1] = (PetscLogDouble) rank;
501: MPIU_Allreduce(&local_val, &min, 1, MPIU_PETSCLOGDOUBLE, MPI_MIN, comm);
502: MPIU_Allreduce(valrank, &max, 1, MPIU_2PETSCLOGDOUBLE, MPI_MAXLOC, comm);
503: MPIU_Allreduce(&local_val, &tot, 1, MPIU_PETSCLOGDOUBLE, MPI_SUM, comm);
504: avg = tot/((PetscLogDouble) size);
505: if (min != 0.0) ratio = max[0]/min;
506: else ratio = 0.0;
508: PetscViewerXMLStartSection(viewer, name, desc);
509: PetscViewerXMLPutDouble(viewer, "max", NULL, max[0], "%e");
510: PetscViewerXMLPutInt( viewer, "maxrank" , "rank at which max was found" , (PetscMPIInt) max[1]);
511: PetscViewerXMLPutDouble(viewer, "ratio", NULL, ratio, "%f");
512: if (print_average) {
513: PetscViewerXMLPutDouble(viewer, "average", NULL, avg, "%e");
514: }
515: if (print_total) {
516: PetscViewerXMLPutDouble(viewer, "total", NULL, tot, "%e");
517: }
518: PetscViewerXMLEndSection(viewer, name);
519: return 0;
520: }
522: /* Print the global performance: max, max/min, average and total of
523: * time, objects, flops, flops/sec, memory, MPI messages, MPI message lengths, MPI reductions.
524: */
525: static PetscErrorCode PetscPrintGlobalPerformance(PetscViewer viewer, PetscLogDouble locTotalTime)
526: {
527: PetscLogDouble flops, mem, red, mess;
528: const PetscBool print_total_yes = PETSC_TRUE,
529: print_total_no = PETSC_FALSE,
530: print_average_no = PETSC_FALSE,
531: print_average_yes = PETSC_TRUE;
533: /* Must preserve reduction count before we go on */
534: red = petsc_allreduce_ct + petsc_gather_ct + petsc_scatter_ct;
536: /* Calculate summary information */
537: PetscViewerXMLStartSection(viewer, "globalperformance", "Global performance");
539: /* Time */
540: PetscPrintXMLGlobalPerformanceElement(viewer, "time", "Time (sec)", locTotalTime, print_average_yes, print_total_no);
542: /* Objects */
543: PetscPrintXMLGlobalPerformanceElement(viewer, "objects", "Objects", (PetscLogDouble) petsc_numObjects, print_average_yes, print_total_no);
545: /* Flop */
546: PetscPrintXMLGlobalPerformanceElement(viewer, "mflop", "MFlop", petsc_TotalFlops/1.0E6, print_average_yes, print_total_yes);
548: /* Flop/sec -- Must talk to Barry here */
549: if (locTotalTime != 0.0) flops = petsc_TotalFlops/locTotalTime;
550: else flops = 0.0;
551: PetscPrintXMLGlobalPerformanceElement(viewer, "mflops", "MFlop/sec", flops/1.0E6, print_average_yes, print_total_yes);
553: /* Memory */
554: PetscMallocGetMaximumUsage(&mem);
555: if (mem > 0.0) {
556: PetscPrintXMLGlobalPerformanceElement(viewer, "memory", "Memory (MiB)", mem/1024.0/1024.0, print_average_yes, print_total_yes);
557: }
558: /* Messages */
559: mess = 0.5*(petsc_irecv_ct + petsc_isend_ct + petsc_recv_ct + petsc_send_ct);
560: PetscPrintXMLGlobalPerformanceElement(viewer, "messagetransfers", "MPI Message Transfers", mess, print_average_yes, print_total_yes);
562: /* Message Volume */
563: mess = 0.5*(petsc_irecv_len + petsc_isend_len + petsc_recv_len + petsc_send_len);
564: PetscPrintXMLGlobalPerformanceElement(viewer, "messagevolume", "MPI Message Volume (MiB)", mess/1024.0/1024.0, print_average_yes, print_total_yes);
566: /* Reductions */
567: PetscPrintXMLGlobalPerformanceElement(viewer, "reductions", "MPI Reductions", red , print_average_no, print_total_no);
568: PetscViewerXMLEndSection(viewer, "globalperformance");
569: return 0;
570: }
572: typedef struct {
573: PetscLogEvent dftEvent;
574: NestedEventId nstEvent;
575: PetscLogEvent dftParent;
576: NestedEventId nstParent;
577: PetscBool own;
578: int depth;
579: NestedEventId* nstPath;
580: } PetscNestedEventTree;
582: /* Compare timers to sort them in the tree */
583: static int compareTreeItems(const void *item1_, const void *item2_)
584: {
585: int i;
586: PetscNestedEventTree *item1 = (PetscNestedEventTree *) item1_;
587: PetscNestedEventTree *item2 = (PetscNestedEventTree *) item2_;
589: for (i=0; i<PetscMin(item1->depth,item2->depth); i++) {
590: if (item1->nstPath[i]<item2->nstPath[i]) return -1;
591: if (item1->nstPath[i]>item2->nstPath[i]) return +1;
592: }
593: if (item1->depth < item2->depth) return -1;
594: if (item1->depth > item2->depth) return 1;
595: return 0;
596: }
597: /*
598: * Do MPI communication to get the complete, nested calling tree for all processes: there may be
599: * calls that happen in some processes, but not in others.
600: *
601: * The output, tree[nTimers] is an array of PetscNestedEventTree-structs.
602: * The tree is sorted so that the timers can be printed in the order of appearance.
603: *
604: * For tree-items which appear in the trees of multiple processes (which will be most items), the
605: * following rule is followed:
606: * + if information from my own process is available, then that is the information stored in tree.
607: * otherwise it is some other process's information.
608: */
609: static PetscErrorCode PetscLogNestedTreeCreate(PetscViewer viewer, PetscNestedEventTree **p_tree, int *p_nTimers)
610: {
611: PetscNestedEventTree *tree = NULL, *newTree;
612: int *treeIndices;
613: int nTimers, totalNTimers, i, j, iTimer0, maxDefaultTimer;
614: int yesno;
615: PetscBool done;
616: int maxdepth;
617: int depth;
618: int illegalEvent;
619: int iextra;
620: NestedEventId *nstPath, *nstMyPath;
621: MPI_Comm comm;
623: PetscObjectGetComm((PetscObject)viewer,&comm);
625: /* Calculate memory needed to store everybody's information and allocate tree */
626: nTimers = 0;
627: for (i=0; i<nNestedEvents; i++) nTimers += nestedEvents[i].nParents;
629: PetscMalloc1(nTimers,&tree);
631: /* Fill tree with readily available information */
632: iTimer0 = 0;
633: maxDefaultTimer =0;
634: for (i=0; i<nNestedEvents; i++) {
635: int nParents = nestedEvents[i].nParents;
636: NestedEventId nstEvent = nestedEvents[i].nstEvent;
637: PetscLogEvent *dftParentsSorted = nestedEvents[i].dftParentsSorted;
638: PetscLogEvent *dftEvents = nestedEvents[i].dftEvents;
639: for (j=0; j<nParents; j++) {
640: maxDefaultTimer = PetscMax(dftEvents[j],maxDefaultTimer);
642: tree[iTimer0+j].dftEvent = dftEvents[j];
643: tree[iTimer0+j].nstEvent = nstEvent;
644: tree[iTimer0+j].dftParent = dftParentsSorted[j];
645: tree[iTimer0+j].own = PETSC_TRUE;
647: tree[iTimer0+j].nstParent = 0;
648: tree[iTimer0+j].depth = 0;
649: tree[iTimer0+j].nstPath = NULL;
650: }
651: iTimer0 += nParents;
652: }
654: /* Calculate the global maximum for the default timer index, so array treeIndices can
655: * be allocated only once */
656: MPIU_Allreduce(&maxDefaultTimer, &j, 1, MPI_INT, MPI_MAX, comm);
657: maxDefaultTimer = j;
659: /* Find default timer's place in the tree */
660: PetscCalloc1(maxDefaultTimer+1,&treeIndices);
661: treeIndices[0] = 0;
662: for (i=0; i<nTimers; i++) {
663: PetscLogEvent dftEvent = tree[i].dftEvent;
664: treeIndices[dftEvent] = i;
665: }
667: /* Find each dftParent's nested identification */
668: for (i=0; i<nTimers; i++) {
669: PetscLogEvent dftParent = tree[i].dftParent;
670: if (dftParent!= DFT_ID_AWAKE) {
671: int j = treeIndices[dftParent];
672: tree[i].nstParent = tree[j].nstEvent;
673: }
674: }
676: /* Find depths for each timer path */
677: done = PETSC_FALSE;
678: maxdepth = 0;
679: while (!done) {
680: done = PETSC_TRUE;
681: for (i=0; i<nTimers; i++) {
682: if (tree[i].dftParent == DFT_ID_AWAKE) {
683: tree[i].depth = 1;
684: maxdepth = PetscMax(1,maxdepth);
685: } else {
686: int j = treeIndices[tree[i].dftParent];
687: depth = 1+tree[j].depth;
688: if (depth>tree[i].depth) {
689: done = PETSC_FALSE;
690: tree[i].depth = depth;
691: maxdepth = PetscMax(depth,maxdepth);
692: }
693: }
694: }
695: }
697: /* Allocate the paths in the entire tree */
698: for (i=0; i<nTimers; i++) {
699: depth = tree[i].depth;
700: PetscCalloc1(depth,&tree[i].nstPath);
701: }
703: /* Calculate the paths for all timers */
704: for (depth=1; depth<=maxdepth; depth++) {
705: for (i=0; i<nTimers; i++) {
706: if (tree[i].depth==depth) {
707: if (depth>1) {
708: int j = treeIndices[tree[i].dftParent];
709: PetscArraycpy(tree[i].nstPath,tree[j].nstPath,depth-1);
710: }
711: tree[i].nstPath[depth-1] = tree[i].nstEvent;
712: }
713: }
714: }
715: PetscFree(treeIndices);
717: /* Sort the tree on basis of the paths */
718: qsort(tree, nTimers, sizeof(PetscNestedEventTree), compareTreeItems);
720: /* Allocate an array to store paths */
721: depth = maxdepth;
722: MPIU_Allreduce(&depth, &maxdepth, 1, MPI_INT, MPI_MAX, comm);
723: PetscMalloc1(maxdepth+1, &nstPath);
724: PetscMalloc1(maxdepth+1, &nstMyPath);
726: /* Find an illegal nested event index (1+largest nested event index) */
727: illegalEvent = 1+nestedEvents[nNestedEvents-1].nstEvent;
728: i = illegalEvent;
729: MPIU_Allreduce(&i, &illegalEvent, 1, MPI_INT, MPI_MAX, comm);
731: /* First, detect timers which are not available in this process, but are available in others
732: * Allocate a new tree, that can contain all timers
733: * Then, fill the new tree with all (own and not-own) timers */
734: newTree= NULL;
735: for (yesno=0; yesno<=1; yesno++) {
736: depth = 1;
737: i = 0;
738: iextra = 0;
739: while (depth>0) {
740: int j;
741: PetscBool same;
743: /* Construct the next path in this process's tree:
744: * if necessary, supplement with invalid path entries */
745: depth++;
747: if (i<nTimers) {
748: for (j=0; j<tree[i].depth; j++) nstMyPath[j] = tree[i].nstPath[j];
749: for (j=tree[i].depth; j<depth; j++) nstMyPath[j] = illegalEvent;
750: } else {
751: for (j=0; j<depth; j++) nstMyPath[j] = illegalEvent;
752: }
754: /* Communicate with other processes to obtain the next path and its depth */
755: MPIU_Allreduce(nstMyPath, nstPath, depth, MPI_INT, MPI_MIN, comm);
756: for (j=depth-1; (int) j>=0; j--) {
757: if (nstPath[j]==illegalEvent) depth=j;
758: }
760: if (depth>0) {
761: /* If the path exists */
763: /* check whether the next path is the same as this process's next path */
764: same = PETSC_TRUE;
765: for (j=0; same && j<depth; j++) { same = (same && nstMyPath[j] == nstPath[j]) ? PETSC_TRUE : PETSC_FALSE;}
767: if (same) {
768: /* Register 'own path' */
769: if (newTree) newTree[i+iextra] = tree[i];
770: i++;
771: } else {
772: /* Register 'not an own path' */
773: if (newTree) {
774: newTree[i+iextra].nstEvent = nstPath[depth-1];
775: newTree[i+iextra].own = PETSC_FALSE;
776: newTree[i+iextra].depth = depth;
777: PetscMalloc1(depth, &newTree[i+iextra].nstPath);
778: for (j=0; j<depth; j++) {newTree[i+iextra].nstPath[j] = nstPath[j];}
780: newTree[i+iextra].dftEvent = 0;
781: newTree[i+iextra].dftParent = 0;
782: newTree[i+iextra].nstParent = 0;
783: }
784: iextra++;
785: }
787: }
788: }
790: /* Determine the size of the complete tree (with own and not-own timers) and allocate the new tree */
791: totalNTimers = nTimers + iextra;
792: if (!newTree) {
793: PetscMalloc1(totalNTimers, &newTree);
794: }
795: }
796: PetscFree(nstPath);
797: PetscFree(nstMyPath);
798: PetscFree(tree);
799: tree = newTree;
800: newTree = NULL;
802: /* Set return value and return */
803: *p_tree = tree;
804: *p_nTimers = totalNTimers;
805: return 0;
806: }
808: /*
809: * Delete the nested timer tree
810: */
811: static PetscErrorCode PetscLogNestedTreeDestroy(PetscNestedEventTree *tree, int nTimers)
812: {
813: int i;
815: for (i=0; i<nTimers; i++) {
816: PetscFree(tree[i].nstPath);
817: }
818: PetscFree(tree);
819: return 0;
820: }
822: /* Print the global performance: max, max/min, average and total of
823: * time, objects, flops, flops/sec, memory, MPI messages, MPI message lengths, MPI reductions.
824: */
825: static PetscErrorCode PetscPrintXMLNestedLinePerfResults(PetscViewer viewer,const char *name,PetscLogDouble value,PetscLogDouble minthreshold,PetscLogDouble maxthreshold,PetscLogDouble minmaxtreshold)
826: {
827: MPI_Comm comm; /* MPI communicator in reduction */
828: PetscMPIInt rank; /* rank of this process */
829: PetscLogDouble val_in[2], max[2], min[2];
830: PetscLogDouble minvalue, maxvalue, tot;
831: PetscMPIInt size;
832: PetscMPIInt minLoc, maxLoc;
834: PetscObjectGetComm((PetscObject)viewer,&comm);
835: MPI_Comm_size(comm, &size);
836: MPI_Comm_rank(comm, &rank);
837: val_in[0] = value;
838: val_in[1] = (PetscLogDouble) rank;
839: MPIU_Allreduce(val_in, max, 1, MPIU_2PETSCLOGDOUBLE, MPI_MAXLOC, comm);
840: MPIU_Allreduce(val_in, min, 1, MPIU_2PETSCLOGDOUBLE, MPI_MINLOC, comm);
841: maxvalue = max[0];
842: maxLoc = (PetscMPIInt) max[1];
843: minvalue = min[0];
844: minLoc = (PetscMPIInt) min[1];
845: MPIU_Allreduce(&value, &tot, 1, MPIU_PETSCLOGDOUBLE, MPI_SUM, comm);
847: if (maxvalue<maxthreshold && minvalue>=minthreshold) {
848: /* One call per parent or NO value: don't print */
849: } else {
850: PetscViewerXMLStartSection(viewer, name, NULL);
851: if (maxvalue>minvalue*minmaxtreshold) {
852: PetscViewerXMLPutDouble(viewer, "avgvalue", NULL, tot/size, "%g");
853: PetscViewerXMLPutDouble(viewer, "minvalue", NULL, minvalue, "%g");
854: PetscViewerXMLPutDouble(viewer, "maxvalue", NULL, maxvalue, "%g");
855: PetscViewerXMLPutInt( viewer, "minloc" , NULL, minLoc);
856: PetscViewerXMLPutInt( viewer, "maxloc" , NULL, maxLoc);
857: } else {
858: PetscViewerXMLPutDouble(viewer, "value", NULL, tot/size, "%g");
859: }
860: PetscViewerXMLEndSection(viewer, name);
861: }
862: return 0;
863: }
865: #define N_COMM 8
866: static PetscErrorCode PetscLogNestedTreePrintLine(PetscViewer viewer,PetscEventPerfInfo perfInfo,PetscLogDouble countsPerCall,int parentCount,int depth,const char *name,PetscLogDouble totalTime,PetscBool *isPrinted)
867: {
868: PetscLogDouble time = perfInfo.time;
869: PetscLogDouble timeMx;
870: MPI_Comm comm;
872: PetscObjectGetComm((PetscObject)viewer,&comm);
873: MPIU_Allreduce(&time, &timeMx, 1, MPIU_PETSCLOGDOUBLE, MPI_MAX, comm);
874: *isPrinted = ((timeMx/totalTime) >= THRESHOLD) ? PETSC_TRUE : PETSC_FALSE;
875: if (*isPrinted) {
876: PetscViewerXMLStartSection(viewer, "event", NULL);
877: PetscViewerXMLPutString(viewer, "name", NULL, name);
878: PetscPrintXMLNestedLinePerfResults(viewer, "time", time/totalTime*100.0, 0, 0, 1.02);
879: PetscPrintXMLNestedLinePerfResults(viewer, "ncalls", parentCount>0 ? countsPerCall : 0, 0.99, 1.01, 1.02);
880: PetscPrintXMLNestedLinePerfResults(viewer, "mflops", time>=timeMx*0.001 ? 1e-6*perfInfo.flops/time : 0, 0, 0.01, 1.05);
881: PetscPrintXMLNestedLinePerfResults(viewer, "mbps",time>=timeMx*0.001 ? perfInfo.messageLength/(1024*1024*time) : 0, 0, 0.01, 1.05);
882: PetscPrintXMLNestedLinePerfResults(viewer, "nreductsps", time>=timeMx*0.001 ? perfInfo.numReductions/time : 0, 0, 0.01, 1.05);
883: }
884: return 0;
885: }
887: /* Count the number of times the parent event was called */
889: static int countParents( const PetscNestedEventTree *tree, PetscEventPerfInfo *eventPerfInfo, int i)
890: {
891: if (tree[i].depth<=1) {
892: return 1; /* Main event: only once */
893: } else if (!tree[i].own) {
894: return 1; /* This event didn't happen in this process, but did in another */
895: } else {
896: int iParent;
897: for (iParent=i-1; iParent>=0; iParent--) {
898: if (tree[iParent].depth == tree[i].depth-1) break;
899: }
900: if (tree[iParent].depth != tree[i].depth-1) {
901: /* ***** Internal error: cannot find parent */
902: return -2;
903: } else {
904: PetscLogEvent dftEvent = tree[iParent].dftEvent;
905: return eventPerfInfo[dftEvent].count;
906: }
907: }
908: }
910: typedef struct {
911: int id;
912: PetscLogDouble val;
913: } PetscSortItem;
915: static int compareSortItems(const void *item1_, const void *item2_)
916: {
917: PetscSortItem *item1 = (PetscSortItem *) item1_;
918: PetscSortItem *item2 = (PetscSortItem *) item2_;
919: if (item1->val > item2->val) return -1;
920: if (item1->val < item2->val) return +1;
921: return 0;
922: }
924: /*
925: * Find the number of child events.
926: */
927: static PetscErrorCode PetscLogNestedTreeGetChildrenCount(const PetscNestedEventTree *tree,int nTimers,int iStart,int depth,int *nChildren)
928: {
929: int n=0;
931: for (int i=iStart+1; i<nTimers; i++) {
932: if (tree[i].depth <= depth) break;
933: if (tree[i].depth == depth + 1) n++;
934: }
935: *nChildren = n;
936: return 0;
937: }
939: /*
940: * Initialize child event sort items with ID and times.
941: */
942: static PetscErrorCode PetscLogNestedTreeSetChildrenSortItems(const PetscViewer viewer,const PetscNestedEventTree *tree,int nTimers,int iStart,int depth,int nChildren,PetscSortItem **children)
943: {
944: MPI_Comm comm;
945: PetscLogDouble *times, *maxTimes;
946: PetscStageLog stageLog;
947: PetscEventPerfInfo *eventPerfInfo;
948: const int stage = MAINSTAGE;
950: PetscObjectGetComm((PetscObject)viewer,&comm);
951: PetscLogGetStageLog(&stageLog);
952: eventPerfInfo = stageLog->stageInfo[stage].eventLog->eventInfo;
954: if (nChildren>0) {
955: /* Create an array for the id-s and maxTimes of the children,
956: * leaving 2 spaces for self-time and other-time */
958: PetscMalloc1(nChildren+2,children);
959: nChildren = 0;
960: for (int i=iStart+1; i<nTimers; i++) {
961: if (tree[i].depth<=depth) break;
962: if (tree[i].depth == depth + 1) {
963: (*children)[nChildren].id = i;
964: (*children)[nChildren].val = eventPerfInfo[tree[i].dftEvent].time ;
965: nChildren++;
966: }
967: }
969: /* Calculate the children's maximum times, to see whether children will be ignored or printed */
970: PetscMalloc1(nChildren,×);
971: for (int i=0; i<nChildren; i++) { times[i] = (*children)[i].val; }
973: PetscMalloc1(nChildren,&maxTimes);
974: MPIU_Allreduce(times, maxTimes, nChildren, MPIU_PETSCLOGDOUBLE, MPI_MAX, comm);
975: PetscFree(times);
977: for (int i=0; i<nChildren; i++) { (*children)[i].val = maxTimes[i]; }
978: PetscFree(maxTimes);
979: }
980: return 0;
981: }
983: /*
984: * Set 'self' and 'other' performance info.
985: */
986: static PetscErrorCode PetscLogNestedTreeSetSelfOtherPerfInfo(const PetscNestedEventTree *tree,int iStart,PetscLogDouble totalTime,const PetscSortItem *children,int nChildren,
987: PetscEventPerfInfo *myPerfInfo,PetscEventPerfInfo *selfPerfInfo,PetscEventPerfInfo *otherPerfInfo,int *parentCount,PetscLogDouble *countsPerCall)
988: {
989: const int stage = MAINSTAGE;
990: PetscStageLog stageLog;
991: PetscEventPerfInfo *eventPerfInfo;
993: PetscLogGetStageLog(&stageLog);
994: eventPerfInfo = stageLog->stageInfo[stage].eventLog->eventInfo;
995: if (!tree[iStart].own) {
996: /* Set values for a timer that was not activated in this process
997: * (but was, in other processes of this run) */
998: PetscMemzero(myPerfInfo,sizeof(*myPerfInfo));
1000: *selfPerfInfo = *myPerfInfo;
1001: *otherPerfInfo = *myPerfInfo;
1003: *parentCount = 1;
1004: *countsPerCall = 0;
1005: } else {
1006: /* Set the values for a timer that was activated in this process */
1007: PetscLogEvent dftEvent = tree[iStart].dftEvent;
1009: *parentCount = countParents(tree,eventPerfInfo,iStart);
1010: *myPerfInfo = eventPerfInfo[dftEvent];
1011: *countsPerCall = (PetscLogDouble) myPerfInfo->count / (PetscLogDouble) *parentCount;
1013: *selfPerfInfo = *myPerfInfo;
1014: otherPerfInfo->time = 0;
1015: otherPerfInfo->flops = 0;
1016: otherPerfInfo->numMessages = 0;
1017: otherPerfInfo->messageLength = 0;
1018: otherPerfInfo->numReductions = 0;
1020: for (int i=0; i<nChildren; i++) {
1021: /* For all child counters: subtract the child values from self-timers */
1023: PetscLogEvent dftChild = tree[children[i].id].dftEvent;
1024: PetscEventPerfInfo childPerfInfo = eventPerfInfo[dftChild];
1026: selfPerfInfo->time -= childPerfInfo.time;
1027: selfPerfInfo->flops -= childPerfInfo.flops;
1028: selfPerfInfo->numMessages -= childPerfInfo.numMessages;
1029: selfPerfInfo->messageLength -= childPerfInfo.messageLength;
1030: selfPerfInfo->numReductions -= childPerfInfo.numReductions;
1032: if ((children[i].val/totalTime) < THRESHOLD) {
1033: /* Add them to 'other' if the time is ignored in the output */
1034: otherPerfInfo->time += childPerfInfo.time;
1035: otherPerfInfo->flops += childPerfInfo.flops;
1036: otherPerfInfo->numMessages += childPerfInfo.numMessages;
1037: otherPerfInfo->messageLength += childPerfInfo.messageLength;
1038: otherPerfInfo->numReductions += childPerfInfo.numReductions;
1039: }
1040: }
1041: }
1042: return 0;
1043: }
1045: /*
1046: * Set max times across ranks for 'self' and 'other'.
1047: */
1048: static PetscErrorCode PetscLogNestedTreeSetMaxTimes(MPI_Comm comm,int nChildren,const PetscEventPerfInfo selfPerfInfo,const PetscEventPerfInfo otherPerfInfo,PetscSortItem *children)
1049: {
1050: PetscLogDouble times[2], maxTimes[2];
1052: times[0] = selfPerfInfo.time;
1053: times[1] = otherPerfInfo.time;
1055: MPIU_Allreduce(times,maxTimes,2,MPIU_PETSCLOGDOUBLE,MPI_MAX,comm);
1056: children[nChildren+0].id = -1;
1057: children[nChildren+0].val = maxTimes[0];
1058: children[nChildren+1].id = -2;
1059: children[nChildren+1].val = maxTimes[1];
1060: return 0;
1061: }
1063: static PetscErrorCode PetscLogNestedTreePrint(PetscViewer viewer, PetscNestedEventTree *tree, int nTimers, int iStart, PetscLogDouble totalTime)
1064: {
1065: int depth = tree[iStart].depth;
1066: const char *name;
1067: int parentCount=1, nChildren;
1068: PetscSortItem *children;
1069: PetscStageLog stageLog;
1070: PetscEventRegInfo *eventRegInfo;
1071: PetscEventPerfInfo myPerfInfo={0},selfPerfInfo={0},otherPerfInfo={0};
1072: PetscLogDouble countsPerCall=0;
1073: PetscBool wasPrinted;
1074: PetscBool childWasPrinted;
1075: MPI_Comm comm;
1077: /* Look up the name of the event and its PerfInfo */
1078: PetscLogGetStageLog(&stageLog);
1079: eventRegInfo = stageLog->eventLog->eventInfo;
1080: name = eventRegInfo[(PetscLogEvent)tree[iStart].nstEvent].name;
1081: PetscObjectGetComm((PetscObject)viewer,&comm);
1083: PetscLogNestedTreeGetChildrenCount(tree,nTimers,iStart,depth,&nChildren);
1084: PetscLogNestedTreeSetChildrenSortItems(viewer,tree,nTimers,iStart,depth,nChildren,&children);
1085: PetscLogNestedTreeSetSelfOtherPerfInfo(tree,iStart,totalTime,children,nChildren,&myPerfInfo,&selfPerfInfo,&otherPerfInfo,&parentCount,&countsPerCall);
1087: /* Main output for this timer */
1088: PetscLogNestedTreePrintLine(viewer, myPerfInfo, countsPerCall, parentCount, depth, name, totalTime, &wasPrinted);
1090: /* Now print the lines for the children */
1091: if (nChildren > 0) {
1092: int i;
1094: /* Calculate max-times for 'self' and 'other' */
1095: PetscLogNestedTreeSetMaxTimes(comm,nChildren,selfPerfInfo,otherPerfInfo,children);
1097: /* Now sort the children (including 'self' and 'other') on total time */
1098: qsort(children, nChildren+2, sizeof(PetscSortItem), compareSortItems);
1100: /* Print (or ignore) the children in ascending order of total time */
1101: PetscViewerXMLStartSection(viewer,"events", NULL);
1102: for (i=0; i<nChildren+2; i++) {
1103: if ((children[i].val/totalTime) < THRESHOLD) {
1104: /* ignored: no output */
1105: } else if (children[i].id==-1) {
1106: PetscLogNestedTreePrintLine(viewer, selfPerfInfo, 1, parentCount, depth+1, "self", totalTime, &childWasPrinted);
1107: if (childWasPrinted) {
1108: PetscViewerXMLEndSection(viewer,"event");
1109: }
1110: } else if (children[i].id==-2) {
1111: size_t len;
1112: char *otherName;
1114: PetscStrlen(name,&len);
1115: PetscMalloc1(len+16,&otherName);
1116: PetscSNPrintf(otherName,len+16,"%s: other-timed",name);
1117: PetscLogNestedTreePrintLine(viewer, otherPerfInfo, 1, 1, depth+1, otherName, totalTime, &childWasPrinted);
1118: PetscFree(otherName);
1119: if (childWasPrinted) {
1120: PetscViewerXMLEndSection(viewer,"event");
1121: }
1122: } else {
1123: /* Print the child with a recursive call to this function */
1124: PetscLogNestedTreePrint(viewer, tree, nTimers, children[i].id, totalTime);
1125: }
1126: }
1127: PetscViewerXMLEndSection(viewer,"events");
1128: PetscFree(children);
1129: }
1131: if (wasPrinted) {
1132: PetscViewerXMLEndSection(viewer, "event");
1133: }
1134: return 0;
1135: }
1137: static PetscErrorCode PetscLogNestedTreePrintTop(PetscViewer viewer, PetscNestedEventTree *tree, int nTimers, PetscLogDouble totalTime)
1138: {
1139: int i, nChildren;
1140: PetscSortItem *children;
1141: MPI_Comm comm;
1143: PetscObjectGetComm((PetscObject)viewer,&comm);
1145: PetscLogNestedTreeGetChildrenCount(tree,nTimers,-1,0,&nChildren);
1146: PetscLogNestedTreeSetChildrenSortItems(viewer,tree,nTimers,-1,0,nChildren,&children);
1148: if (nChildren>0) {
1149: /* Now sort the children on total time */
1150: qsort(children, nChildren, sizeof(PetscSortItem), compareSortItems);
1151: /* Print (or ignore) the children in ascending order of total time */
1152: PetscViewerXMLStartSection(viewer, "timertree", "Timings tree");
1153: PetscViewerXMLPutDouble(viewer, "totaltime", NULL, totalTime, "%f");
1154: PetscViewerXMLPutDouble(viewer, "timethreshold", NULL, thresholdTime, "%f");
1156: for (i=0; i<nChildren; i++) {
1157: if ((children[i].val/totalTime) < THRESHOLD) {
1158: /* ignored: no output */
1159: } else {
1160: /* Print the child with a recursive call to this function */
1161: PetscLogNestedTreePrint(viewer, tree, nTimers, children[i].id, totalTime);
1162: }
1163: }
1164: PetscViewerXMLEndSection(viewer, "timertree");
1165: PetscFree(children);
1166: }
1167: return 0;
1168: }
1170: typedef struct {
1171: char *name;
1172: PetscLogDouble time;
1173: PetscLogDouble flops;
1174: PetscLogDouble numMessages;
1175: PetscLogDouble messageLength;
1176: PetscLogDouble numReductions;
1177: } PetscSelfTimer;
1179: static PetscErrorCode PetscCalcSelfTime(PetscViewer viewer, PetscSelfTimer **p_self, int *p_nstMax)
1180: {
1181: const int stage = MAINSTAGE;
1182: PetscStageLog stageLog;
1183: PetscEventRegInfo *eventRegInfo;
1184: PetscEventPerfInfo *eventPerfInfo;
1185: PetscSelfTimer *selftimes;
1186: PetscSelfTimer *totaltimes;
1187: NestedEventId *nstEvents;
1188: int i, j, maxDefaultTimer;
1189: NestedEventId nst;
1190: PetscLogEvent dft;
1191: int nstMax, nstMax_local;
1192: MPI_Comm comm;
1194: PetscObjectGetComm((PetscObject)viewer,&comm);
1195: PetscLogGetStageLog(&stageLog);
1196: eventRegInfo = stageLog->eventLog->eventInfo;
1197: eventPerfInfo = stageLog->stageInfo[stage].eventLog->eventInfo;
1199: /* For each default timer, calculate the (one) nested timer that it corresponds to. */
1200: maxDefaultTimer =0;
1201: for (i=0; i<nNestedEvents; i++) {
1202: int nParents = nestedEvents[i].nParents;
1203: PetscLogEvent *dftEvents = nestedEvents[i].dftEvents;
1204: for (j=0; j<nParents; j++) maxDefaultTimer = PetscMax(dftEvents[j],maxDefaultTimer);
1205: }
1206: PetscMalloc1(maxDefaultTimer+1,&nstEvents);
1207: for (dft=0; dft<maxDefaultTimer; dft++) {nstEvents[dft] = 0;}
1208: for (i=0; i<nNestedEvents; i++) {
1209: int nParents = nestedEvents[i].nParents;
1210: NestedEventId nstEvent = nestedEvents[i].nstEvent;
1211: PetscLogEvent *dftEvents = nestedEvents[i].dftEvents;
1212: for (j=0; j<nParents; j++) nstEvents[dftEvents[j]] = nstEvent;
1213: }
1215: /* Calculate largest nested event-ID */
1216: nstMax_local = 0;
1217: for (i=0; i<nNestedEvents; i++) nstMax_local = PetscMax(nestedEvents[i].nstEvent,nstMax_local);
1218: MPIU_Allreduce(&nstMax_local, &nstMax, 1, MPI_INT, MPI_MAX, comm);
1220: /* Initialize all total-times with zero */
1221: PetscMalloc1(nstMax+1,&selftimes);
1222: PetscMalloc1(nstMax+1,&totaltimes);
1223: for (nst=0; nst<=nstMax; nst++) {
1224: totaltimes[nst].time = 0;
1225: totaltimes[nst].flops = 0;
1226: totaltimes[nst].numMessages = 0;
1227: totaltimes[nst].messageLength = 0;
1228: totaltimes[nst].numReductions = 0;
1229: totaltimes[nst].name = NULL;
1230: }
1232: /* Calculate total-times */
1233: for (i=0; i<nNestedEvents; i++) {
1234: const int nParents = nestedEvents[i].nParents;
1235: const NestedEventId nstEvent = nestedEvents[i].nstEvent;
1236: const PetscLogEvent *dftEvents = nestedEvents[i].dftEvents;
1237: for (j=0; j<nParents; j++) {
1238: const PetscLogEvent dftEvent = dftEvents[j];
1239: totaltimes[nstEvent].time += eventPerfInfo[dftEvent].time;
1240: totaltimes[nstEvent].flops += eventPerfInfo[dftEvent].flops;
1241: totaltimes[nstEvent].numMessages += eventPerfInfo[dftEvent].numMessages;
1242: totaltimes[nstEvent].messageLength += eventPerfInfo[dftEvent].messageLength;
1243: totaltimes[nstEvent].numReductions += eventPerfInfo[dftEvent].numReductions;
1244: }
1245: totaltimes[nstEvent].name = eventRegInfo[(PetscLogEvent)nstEvent].name;
1246: }
1248: /* Initialize: self-times := totaltimes */
1249: for (nst=0; nst<=nstMax; nst++) { selftimes[nst] = totaltimes[nst]; }
1251: /* Subtract timed subprocesses from self-times */
1252: for (i=0; i<nNestedEvents; i++) {
1253: const int nParents = nestedEvents[i].nParents;
1254: const PetscLogEvent *dftEvents = nestedEvents[i].dftEvents;
1255: const NestedEventId *dftParentsSorted = nestedEvents[i].dftParentsSorted;
1256: for (j=0; j<nParents; j++) {
1257: if (dftParentsSorted[j] != DFT_ID_AWAKE) {
1258: const PetscLogEvent dftEvent = dftEvents[j];
1259: const NestedEventId nstParent = nstEvents[dftParentsSorted[j]];
1260: selftimes[nstParent].time -= eventPerfInfo[dftEvent].time;
1261: selftimes[nstParent].flops -= eventPerfInfo[dftEvent].flops;
1262: selftimes[nstParent].numMessages -= eventPerfInfo[dftEvent].numMessages;
1263: selftimes[nstParent].messageLength -= eventPerfInfo[dftEvent].messageLength;
1264: selftimes[nstParent].numReductions -= eventPerfInfo[dftEvent].numReductions;
1265: }
1266: }
1267: }
1269: PetscFree(nstEvents);
1270: PetscFree(totaltimes);
1272: /* Set outputs */
1273: *p_self = selftimes;
1274: *p_nstMax = nstMax;
1275: return 0;
1276: }
1278: static PetscErrorCode PetscPrintSelfTime(PetscViewer viewer, const PetscSelfTimer *selftimes, int nstMax, PetscLogDouble totalTime)
1279: {
1280: int i;
1281: NestedEventId nst;
1282: PetscSortItem *sortSelfTimes;
1283: PetscLogDouble *times, *maxTimes;
1284: PetscStageLog stageLog;
1285: PetscEventRegInfo *eventRegInfo;
1286: const int dum_depth = 1, dum_count=1, dum_parentcount=1;
1287: PetscBool wasPrinted;
1288: MPI_Comm comm;
1290: PetscObjectGetComm((PetscObject)viewer,&comm);
1291: PetscLogGetStageLog(&stageLog);
1292: eventRegInfo = stageLog->eventLog->eventInfo;
1294: PetscMalloc1(nstMax+1,×);
1295: PetscMalloc1(nstMax+1,&maxTimes);
1296: for (nst=0; nst<=nstMax; nst++) { times[nst] = selftimes[nst].time;}
1297: MPIU_Allreduce(times, maxTimes, nstMax+1, MPIU_PETSCLOGDOUBLE, MPI_MAX, comm);
1298: PetscFree(times);
1300: PetscMalloc1(nstMax+1,&sortSelfTimes);
1302: /* Sort the self-timers on basis of the largest time needed */
1303: for (nst=0; nst<=nstMax; nst++) {
1304: sortSelfTimes[nst].id = nst;
1305: sortSelfTimes[nst].val = maxTimes[nst];
1306: }
1307: PetscFree(maxTimes);
1308: qsort(sortSelfTimes, nstMax+1, sizeof(PetscSortItem), compareSortItems);
1310: PetscViewerXMLStartSection(viewer, "selftimertable", "Self-timings");
1311: PetscViewerXMLPutDouble(viewer, "totaltime", NULL, totalTime, "%f");
1313: for (i=0; i<=nstMax; i++) {
1314: if ((sortSelfTimes[i].val/totalTime) >= THRESHOLD) {
1315: NestedEventId nstEvent = sortSelfTimes[i].id;
1316: const char *name = eventRegInfo[(PetscLogEvent)nstEvent].name;
1317: PetscEventPerfInfo selfPerfInfo;
1319: selfPerfInfo.time = selftimes[nstEvent].time ;
1320: selfPerfInfo.flops = selftimes[nstEvent].flops;
1321: selfPerfInfo.numMessages = selftimes[nstEvent].numMessages;
1322: selfPerfInfo.messageLength = selftimes[nstEvent].messageLength;
1323: selfPerfInfo.numReductions = selftimes[nstEvent].numReductions;
1325: PetscLogNestedTreePrintLine(viewer, selfPerfInfo, dum_count, dum_parentcount, dum_depth, name, totalTime, &wasPrinted);
1326: if (wasPrinted) {
1327: PetscViewerXMLEndSection(viewer, "event");
1328: }
1329: }
1330: }
1331: PetscViewerXMLEndSection(viewer, "selftimertable");
1332: PetscFree(sortSelfTimes);
1333: return 0;
1334: }
1336: PetscErrorCode PetscLogView_Nested(PetscViewer viewer)
1337: {
1338: PetscLogDouble locTotalTime, globTotalTime;
1339: PetscNestedEventTree *tree = NULL;
1340: PetscSelfTimer *selftimers = NULL;
1341: int nTimers = 0, nstMax = 0;
1342: MPI_Comm comm;
1344: PetscObjectGetComm((PetscObject)viewer,&comm);
1345: PetscViewerInitASCII_XML(viewer);
1346: PetscViewerASCIIPrintf(viewer, "<!-- PETSc Performance Summary: -->\n");
1347: PetscViewerXMLStartSection(viewer, "petscroot", NULL);
1349: /* Get the total elapsed time, local and global maximum */
1350: PetscTime(&locTotalTime); locTotalTime -= petsc_BaseTime;
1351: MPIU_Allreduce(&locTotalTime, &globTotalTime, 1, MPIU_PETSCLOGDOUBLE, MPI_MAX, comm);
1353: /* Print global information about this run */
1354: PetscPrintExeSpecs(viewer);
1355: PetscPrintGlobalPerformance(viewer, locTotalTime);
1356: /* Collect nested timer tree info from all processes */
1357: PetscLogNestedTreeCreate(viewer, &tree, &nTimers);
1358: PetscLogNestedTreePrintTop(viewer, tree, nTimers, globTotalTime);
1359: PetscLogNestedTreeDestroy(tree, nTimers);
1361: /* Calculate self-time for all (not-nested) events */
1362: PetscCalcSelfTime(viewer, &selftimers, &nstMax);
1363: PetscPrintSelfTime(viewer, selftimers, nstMax, globTotalTime);
1364: PetscFree(selftimers);
1366: PetscViewerXMLEndSection(viewer, "petscroot");
1367: PetscViewerFinalASCII_XML(viewer);
1368: return 0;
1369: }
1371: /*
1372: * Get the name of a nested event.
1373: */
1374: static PetscErrorCode PetscGetNestedEventName(const PetscNestedEventTree *tree,int id,char **name)
1375: {
1376: PetscStageLog stageLog;
1378: PetscLogGetStageLog(&stageLog);
1379: *name = stageLog->eventLog->eventInfo[(PetscLogEvent)tree[id].nstEvent].name;
1380: return 0;
1381: }
1383: /*
1384: * Get the total time elapsed.
1385: */
1386: static PetscErrorCode PetscGetTotalTime(const PetscViewer viewer,PetscLogDouble *totalTime)
1387: {
1388: PetscLogDouble locTotalTime;
1389: MPI_Comm comm;
1391: PetscObjectGetComm((PetscObject)viewer,&comm);
1392: PetscTime(&locTotalTime);
1393: locTotalTime -= petsc_BaseTime;
1394: MPIU_Allreduce(&locTotalTime,totalTime,1,MPIU_PETSCLOGDOUBLE,MPI_MAX,comm);
1395: return 0;
1396: }
1398: /*
1399: * Write a line to the flame graph output and then recurse into child events.
1400: */
1401: static PetscErrorCode PetscLogNestedTreePrintFlamegraph(PetscViewer viewer,PetscNestedEventTree *tree,int nTimers,int iStart,PetscLogDouble totalTime,PetscIntStack eventStack)
1402: {
1403: int depth=tree[iStart].depth,parentCount=1,i,nChildren;
1404: char *name=NULL;
1405: PetscEventPerfInfo myPerfInfo={0},selfPerfInfo={0},otherPerfInfo={0};
1406: PetscLogDouble countsPerCall=0,locTime,globTime;
1407: PetscSortItem *children;
1408: PetscStageLog stageLog;
1409: MPI_Comm comm;
1411: PetscLogGetStageLog(&stageLog);
1412: PetscObjectGetComm((PetscObject)viewer,&comm);
1414: /* Determine information about the child events as well as 'self' and 'other' */
1415: PetscLogNestedTreeGetChildrenCount(tree,nTimers,iStart,depth,&nChildren);
1416: PetscLogNestedTreeSetChildrenSortItems(viewer,tree,nTimers,iStart,depth,nChildren,&children);
1417: PetscLogNestedTreeSetSelfOtherPerfInfo(tree,iStart,totalTime,children,nChildren,&myPerfInfo,&selfPerfInfo,&otherPerfInfo,&parentCount,&countsPerCall);
1419: /* Write line to the file. The time shown is 'self' + 'other' because each entry in the output
1420: * is the total time spent in the event minus the amount spent in child events. */
1421: locTime = selfPerfInfo.time + otherPerfInfo.time;
1422: MPIU_Allreduce(&locTime,&globTime,1,MPIU_PETSCLOGDOUBLE,MPI_MAX,comm);
1423: if (globTime/totalTime > THRESHOLD && tree[iStart].own) {
1424: /* Iterate over parent events in the stack and write them */
1425: for (i=0; i<=eventStack->top; i++) {
1426: PetscGetNestedEventName(tree,eventStack->stack[i],&name);
1427: PetscViewerASCIIPrintf(viewer,"%s;",name);
1428: }
1429: PetscGetNestedEventName(tree,iStart,&name);
1430: /* The output is given as an integer in microseconds because otherwise the file cannot be read
1431: * by apps such as speedscope (https://speedscope.app/). */
1432: PetscViewerASCIIPrintf(viewer,"%s %" PetscInt64_FMT "\n",name,(PetscInt64)(globTime*1e6));
1433: }
1435: /* Add the current event to the parent stack and write the child events */
1436: PetscIntStackPush(eventStack, iStart);
1437: for (i=0; i<nChildren; i++) {
1438: PetscLogNestedTreePrintFlamegraph(viewer,tree,nTimers,children[i].id,totalTime,eventStack);
1439: }
1440: /* Pop the top item from the stack and immediately discard it */
1441: {
1442: int tmp;
1443: PetscIntStackPop(eventStack, &tmp);
1444: }
1445: return 0;
1446: }
1448: /*
1449: * Print nested logging information to a file suitable for reading into a Flame Graph.
1450: *
1451: * The format consists of a semicolon-separated list of events and the event duration in microseconds (which must be an integer).
1452: * An example output would look like:
1453: * MatAssemblyBegin 1
1454: * MatAssemblyEnd 10
1455: * MatView 302
1456: * KSPSetUp 98
1457: * KSPSetUp;VecSet 5
1458: * KSPSolve 150
1459: *
1460: * This option may be requested from the command line by passing in the flag `-log_view :<somefile>.txt:ascii_flamegraph`.
1461: */
1462: PetscErrorCode PetscLogView_Flamegraph(PetscViewer viewer)
1463: {
1464: int nTimers=0,i,nChildren;
1465: PetscIntStack eventStack;
1466: PetscLogDouble totalTime;
1467: PetscNestedEventTree *tree=NULL;
1468: PetscSortItem *children;
1470: PetscGetTotalTime(viewer,&totalTime);
1471: PetscLogNestedTreeCreate(viewer, &tree, &nTimers);
1472: /* We use an integer stack to keep track of parent event IDs */
1473: PetscIntStackCreate(&eventStack);
1475: /* Initialize the child events and write them recursively */
1476: PetscLogNestedTreeGetChildrenCount(tree,nTimers,-1,0,&nChildren);
1477: PetscLogNestedTreeSetChildrenSortItems(viewer,tree,nTimers,-1,0,nChildren,&children);
1478: for (i=0; i<nChildren; i++) {
1479: PetscLogNestedTreePrintFlamegraph(viewer,tree,nTimers,children[i].id,totalTime,eventStack);
1480: }
1482: PetscLogNestedTreeDestroy(tree, nTimers);
1483: PetscIntStackDestroy(eventStack);
1484: return 0;
1485: }
1487: PETSC_EXTERN PetscErrorCode PetscASend(int count, int datatype)
1488: {
1489: #if !defined(MPIUNI_H) && !defined(PETSC_HAVE_BROKEN_RECURSIVE_MACRO) && !defined(PETSC_HAVE_MPI_MISSING_TYPESIZE)
1490: #endif
1492: petsc_send_ct++;
1493: #if !defined(MPIUNI_H) && !defined(PETSC_HAVE_BROKEN_RECURSIVE_MACRO) && !defined(PETSC_HAVE_MPI_MISSING_TYPESIZE)
1494: PetscMPITypeSize(count,MPI_Type_f2c((MPI_Fint) datatype),&petsc_send_len);
1495: #endif
1496: return 0;
1497: }
1499: PETSC_EXTERN PetscErrorCode PetscARecv(int count, int datatype)
1500: {
1501: #if !defined(MPIUNI_H) && !defined(PETSC_HAVE_BROKEN_RECURSIVE_MACRO) && !defined(PETSC_HAVE_MPI_MISSING_TYPESIZE)
1502: #endif
1504: petsc_recv_ct++;
1505: #if !defined(MPIUNI_H) && !defined(PETSC_HAVE_BROKEN_RECURSIVE_MACRO) && !defined(PETSC_HAVE_MPI_MISSING_TYPESIZE)
1506: PetscMPITypeSize(count,MPI_Type_f2c((MPI_Fint) datatype),&petsc_recv_len);
1507: #endif
1508: return 0;
1509: }
1511: PETSC_EXTERN PetscErrorCode PetscAReduce()
1512: {
1513: petsc_allreduce_ct++;
1514: return 0;
1515: }
1517: #endif