CatgirlIntelligenceAgency/run/test-data/url--425233170

2037 lines
238 KiB
Plaintext
Raw Normal View History

<!doctype html>
<html class="ltr" dir="ltr" lang="en-US">
<head>
<!-- Encoding is declared first so it is known before any other head content; the parser's encoding pre-scan only looks at the first 1024 bytes of the document. -->
<meta content="text/html; charset=UTF-8" http-equiv="content-type">
<meta name="google-site-verification" content="8qCO_zZVZwiC44uEPeBCFNAKZdRxIXMhuAYJFJBpdL0">
<title>Stampede2 User Guide - TACC User Portal</title>
<!-- Theme scripts: jQuery core first, then the plugins and theme code that follow it. Order is preserved. -->
<script src="https://portal.tacc.utexas.edu/portal-theme/js/jquery.min.js" type="text/javascript"></script>
<script src="https://portal.tacc.utexas.edu/portal-theme/js/jquery-ui.min.js" type="text/javascript"></script>
<script src="https://portal.tacc.utexas.edu/portal-theme/js/jquery.dataTables.min.js" type="text/javascript"></script>
<script src="https://portal.tacc.utexas.edu/portal-theme/js/jquery.tablesorter.min.js" type="text/javascript"></script>
<script src="https://portal.tacc.utexas.edu/portal-theme/js/jquery.doTimeout.min.js" type="text/javascript"></script>
<script src="https://portal.tacc.utexas.edu/portal-theme/js/jquery.toc.min.js" type="text/javascript"></script>
<script src="https://portal.tacc.utexas.edu/portal-theme/js/navbar.js" type="text/javascript"></script>
<script src="https://portal.tacc.utexas.edu/portal-theme/js/bootstrap.js" type="text/javascript"></script>
<script src="https://portal.tacc.utexas.edu/portal-theme/js/auto-enable-toc.js" type="text/javascript"></script>
<link href="https://portal.tacc.utexas.edu/portal-theme/images/favicon.ico" rel="Shortcut Icon">
<link href="https://portal.tacc.utexas.edu/user-guides/stampede2" rel="canonical">
<link href="/html/css/main.css?browserId=other&amp;themeId=portal_WAR_portaltheme&amp;minifierType=css&amp;languageId=en_US&amp;b=6120&amp;t=1379449732000" rel="stylesheet" type="text/css">
<link href="/html/portlet/journal_content/css/main.css?browserId=other&amp;themeId=portal_WAR_portaltheme&amp;minifierType=css&amp;languageId=en_US&amp;b=6120&amp;t=1614359417000" rel="stylesheet" type="text/css">
<!--
  Portal bootstrap data (minified, generated).
  Defines the global `Liferay` object with these members:
    Browser       : constant-returning feature/version probes (isChrome, getMajorVersion, ...)
    Data          : isCustomizationView() and a `notices` array
    ThemeDisplay  : getters for company/group/layout ids, layout URL, path prefixes, session id
    PropsValues   : NTLM_AUTH_ENABLED flag
  It then aliases `themeDisplay` to Liferay.ThemeDisplay, defines Liferay.AUI (base URL, combo path,
  URL filter, root paths), sets window.YUI_config from those getters, and records
  Liferay.authToken, Liferay.currentURL and Liferay.currentURLEncoded.
  NOTE(review): the captured values include a session id (getSessionId / jsessionid in currentURL)
  and an auth token; confirm these are acceptable to keep in a stored copy of the page.
-->
<script type="text/javascript">/*<![CDATA[*/var Liferay={Browser:{acceptsGzip:function(){return true},getMajorVersion:function(){return 79},getRevision:function(){return"537.36"},getVersion:function(){return"79.0.3945.130"},isAir:function(){return false},isChrome:function(){return true},isFirefox:function(){return false},isGecko:function(){return true},isIe:function(){return false},isIphone:function(){return false},isLinux:function(){return false},isMac:function(){return false},isMobile:function(){return false},isMozilla:function(){return false},isOpera:function(){return false},isRtf:function(){return true},isSafari:function(){return true},isSun:function(){return false},isWap:function(){return false},isWapXhtml:function(){return false},isWebKit:function(){return true},isWindows:function(){return true},isWml:function(){return false}},Data:{isCustomizationView:function(){return false},notices:[null]},ThemeDisplay:{getCDNDynamicResourcesHost:function(){return""},getCDNHost:function(){return""},getCompanyId:function(){return"10132"},getCompanyGroupId:function(){return"10165"},getUserId:function(){return"10135"},getDoAsUserIdEncoded:function(){return""},getPlid:function(){return"1475721"},getLayoutId:function(){return"164"},getLayoutURL:function(){return"https://portal.tacc.utexas.edu/user-guides/stampede2"},isPrivateLayout:function(){return"false"},getParentLayoutId:function(){return"17"},getScopeGroupId:function(){return"10157"},getScopeGroupIdOrLiveGroupId:function(){return"10157"},getParentGroupId:function(){return"10157"},isImpersonated:function(){return false},isSignedIn:function(){return false},getDefaultLanguageId:function(){return"en_US"},getLanguageId:function(){return"en_US"},isAddSessionIdToURL:function(){return false},isFreeformLayout:function(){return false},isStateExclusive:function(){return false},isStateMaximized:function(){return false},isStatePopUp:function(){return 
false},getPathContext:function(){return""},getPathImage:function(){return"/image"},getPathJavaScript:function(){return"/html/js"},getPathMain:function(){return"/c"},getPathThemeImages:function(){return"https://portal.tacc.utexas.edu/portal-theme/images"},getPathThemeRoot:function(){return"/portal-theme/"},getURLControlPanel:function(){return"/group/control_panel?doAsGroupId=10157&refererPlid=1475721"},getURLHome:function(){return"https://portal.tacc.utexas.edu/web/tup/home"},getSessionId:function(){return"AF04850B586BFC725A1DA07C09C26690"},getPortletSetupShowBordersDefault:function(){return false}},PropsValues:{NTLM_AUTH_ENABLED:false}};var themeDisplay=Liferay.ThemeDisplay;Liferay.AUI={getBaseURL:function(){return"https://portal.tacc.utexas.edu/html/js/aui/"},getCombine:function(){return true},getComboPath:function(){return"/combo/?browserId=other&minifierType=&languageId=en_US&b=6120&t=1614889185000&p=/html/js&"},getFilter:function(){return{replaceStr:function(c,b,a){return b+"m="+(c.split("/html/js")[1]||"")},searchExp:"(\\?|&)/([^&]+)"}},getJavaScriptRootPath:function(){return"/html/js"},getLangPath:function(){return"aui_lang.jsp?browserId=other&themeId=portal_WAR_portaltheme&colorSchemeId=01&minifierType=js&languageId=en_US&b=6120&t=1614889185000"},getRootPath:function(){return"/html/js/aui/"}};window.YUI_config={base:Liferay.AUI.getBaseURL(),comboBase:Liferay.AUI.getComboPath(),fetchCSS:true,filter:Liferay.AUI.getFilter(),root:Liferay.AUI.getRootPath(),useBrowserConsole:false};Liferay.authToken="EH0RlFi6";Liferay.currentURL="\x2fuser-guides\x2fstampede2\x3bjsessionid\x3dAF04850B586BFC725A1DA07C09C26690";Liferay.currentURLEncoded="%2Fuser-guides%2Fstampede2%3Bjsessionid%3DAF04850B586BFC725A1DA07C09C26690";/*]]>*/</script>
<!-- Bundled portal JavaScript served by everything.jsp (bundle id "javascript.everything.files"). -->
<script src="/html/js/everything.jsp?browserId=other&amp;themeId=portal_WAR_portaltheme&amp;colorSchemeId=01&amp;minifierType=js&amp;minifierBundleId=javascript.everything.files&amp;languageId=en_US&amp;b=6120&amp;t=1614889185000" type="text/javascript"></script>
<!-- Records the two portlet ids present on this page in Liferay.Portlet.list. -->
<script type="text/javascript">/*<![CDATA[*/Liferay.Portlet.list=["1_WAR_kaleodesignerportlet","56_INSTANCE_RYXn3pn9Wi4j"];/*]]>*/</script>
<!-- Google Analytics: queues _setAccount and _trackPageview on _gaq, then injects ga.js with the async attribute, choosing the ssl or www host from the page protocol. -->
<script type="text/javascript">/*<![CDATA[*/var _gaq=_gaq||[];_gaq.push(["_setAccount","UA-125525035-7"]);_gaq.push(["_trackPageview"]);(function(){var a=document.createElement("script");a.src=("https:"==document.location.protocol?"https://ssl":"http://www")+".google-analytics.com/ga.js";a.setAttribute("async","true");document.documentElement.firstChild.appendChild(a)})();/*]]>*/</script>
<!-- Theme stylesheet. -->
<link class="lfr-css-file" href="https://portal.tacc.utexas.edu/portal-theme/css/main.css?browserId=other&amp;themeId=portal_WAR_portaltheme&amp;minifierType=css&amp;languageId=en_US&amp;b=6120&amp;t=1615492875000" rel="stylesheet" type="text/css">
<!-- Inline overrides: user-summary labels, width/offset of the Documentation drop-down, hiding account-request and password-reset items under .signed-in, and h4-h6 margins. -->
<style type="text/css">.user-summary-portlet .label{background-color:transparent;color:black;text-shadow:none;padding:0;display:inline}nav ul.primary li.documentation ul.secondary{width:800px;margin-left:-300px}.signed-in .accountrequest,.signed-in .passwordreset{display:none}h4,h5,h6{margin:.8em 0}</style>
<style type="text/css">
/* Background tints that distinguish job-script listings from command-line examples. */
.job-script{background-color:#efd5bf;color:black}
.cmd-line{background-color:#e4edf6;color:black}
/* Table-of-contents entries step down in size by heading level.
   A CSS number needs a digit after its decimal point, so the h2 size is written "1em". */
.toc-h1{font-size:1.1em}
.toc-h2{font-size:1em}
.toc-h3{font-size:.9em}
</style>
</head>
<body class=" yui3-skin-sam controls-visible guest-site signed-out public-page site">
<div id="wrapper"> <a href="#main-content" id="skip-to-content">Skip to Content</a>
<header id="banner" role="banner">
<hgroup id="heading">
<h1 class="company-title"> <a href="/" title="Go to TACC User Portal"> <img src="https://portal.tacc.utexas.edu/portal-theme/images/TACC_Logo_vector.png" width="218" height="65" alt="TACC"> User Portal </a> </h1>
<h2 class="community-title"> <a href="https://portal.tacc.utexas.edu/user-guides/stampede2?p_p_auth=3zGDtOgQ&amp;p_p_id=49&amp;p_p_lifecycle=1&amp;p_p_state=normal&amp;p_p_mode=view&amp;_49_struts_action=%2Fmy_sites%2Fview&amp;_49_groupId=10157&amp;_49_privateLayout=false" title="Go to TACC User Portal"> <span>TACC User Portal</span> </a> </h2>
<h3 class="page-title"> <span>Stampede2 User Guide</span> </h3>
</hgroup> <a href="/c/portal/login?p_l_id=1475721" id="sign-in" rel="nofollow">Sign In</a>
<nav>
<ul class="primary">
<li class="home"> <a href="https://portal.tacc.utexas.edu/home"> <span> Home</span> </a>
<ul class="secondary">
<li class="accountrequest"> <a href="https://portal.tacc.utexas.edu/account-request"> <span> Account Request</span> </a> </li>
<li class="passwordreset"> <a href="https://portal.tacc.utexas.edu/password-reset"> <span> Password Reset</span> </a> </li>
<li class="newuserinformation"> <a href="https://portal.tacc.utexas.edu/new-user-information"> <span> New User Information</span> </a> </li>
<li class="accountprofile"> <a href="https://portal.tacc.utexas.edu/account-profile"> <span> Account Profile</span> </a> </li>
</ul> </li>
<li class="news"> <a href="https://portal.tacc.utexas.edu/news"> <span> News</span> </a>
<ul class="secondary">
<li class="usernews"> <a href="https://portal.tacc.utexas.edu/user-news"> <span> User News</span> </a> </li>
<li class="subscribe"> <a href="https://portal.tacc.utexas.edu/news/subscribe"> <span> Subscribe</span> </a> </li>
</ul> </li>
<li class="resources"> <a href="https://portal.tacc.utexas.edu/resources"> <span> Resources</span> </a>
<ul class="secondary">
<li class="systemsmonitor"> <a href="https://portal.tacc.utexas.edu/system-monitor"> <span> Systems Monitor</span> </a> </li>
<li class="softwaresearch"> <a href="https://portal.tacc.utexas.edu/resources/software-search"> <span> Software Search</span> </a> </li>
<li class="taccvisualizationportal"> <a href="https://vis.tacc.utexas.edu/"> <span> TACC Visualization Portal</span> </a> </li>
<li class="vislabreservations"> <a href="https://portal.tacc.utexas.edu/vislab-reservations"> <span> Vislab Reservations</span> </a> </li>
</ul> </li>
<li class="allocations"> <a href="https://portal.tacc.utexas.edu/allocations"> <span> Allocations</span> </a>
<ul class="secondary">
<li class="allocationsoverview"> <a href="https://portal.tacc.utexas.edu/allocations-overview"> <span> Allocations Overview</span> </a> </li>
<li class="projectsandallocations"> <a href="https://portal.tacc.utexas.edu/projects-and-allocations"> <span> Projects and Allocations</span> </a> </li>
<li class="managingallocations"> <a href="https://portal.tacc.utexas.edu/tutorials/managing-allocations"> <span> Managing Allocations</span> </a> </li>
</ul> </li>
<li class="documentation current"> <a href="https://portal.tacc.utexas.edu/documentation"> <span> Documentation</span> </a>
<ul class="secondary">
<li class="userguides"> <a href="https://portal.tacc.utexas.edu/user-guides"> <span> User Guides</span> </a>
<ul class="tertiary">
<li class="chameleon"> <a href="https://www.chameleoncloud.org/docs/user-guides/"> <span> Chameleon</span> </a> </li>
<li class="corral"> <a href="https://portal.tacc.utexas.edu/user-guides/corral"> <span> Corral</span> </a> </li>
<li class="frontera"> <a href="https://fronteraweb.tacc.utexas.edu/user-guide/"> <span> Frontera</span> </a> </li>
<li class="lonestar"> <a href="https://portal.tacc.utexas.edu/user-guides/lonestar5"> <span> Lonestar 5</span> </a> </li>
<li class="longhorn"> <a href="https://portal.tacc.utexas.edu/user-guides/longhorn"> <span> Longhorn</span> </a> </li>
<li class="maverick"> <a href="https://portal.tacc.utexas.edu/user-guides/maverick2"> <span> Maverick2</span> </a> </li>
<li class="ranch"> <a href="https://portal.tacc.utexas.edu/user-guides/ranch"> <span> Ranch</span> </a> </li>
<li class="stallion"> <a href="https://portal.tacc.utexas.edu/user-guides/stallion"> <span> Stallion</span> </a> </li>
<li class="stampede current"> <a href="https://portal.tacc.utexas.edu/user-guides/stampede2"> <span> Stampede2</span> </a> </li>
</ul> </li>
<li class="software"> <a href="https://portal.tacc.utexas.edu/software"> <span> Software</span> </a>
<ul class="tertiary">
<li class="abaqus"> <a href="https://portal.tacc.utexas.edu/software/abaqus"> <span> ABAQUS</span> </a> </li>
<li class="ansys"> <a href="https://portal.tacc.utexas.edu/software/ansys"> <span> ANSYS</span> </a> </li>
<li class="caffe"> <a href="https://portal.tacc.utexas.edu/software/caffe"> <span> Caffe</span> </a> </li>
<li class="gaussian"> <a href="https://portal.tacc.utexas.edu/software/gaussian"> <span> Gaussian</span> </a> </li>
<li class="gromacs"> <a href="https://portal.tacc.utexas.edu/software/gromacs"> <span> GROMACS</span> </a> </li>
<li class="idev"> <a href="https://portal.tacc.utexas.edu/software/idev"> <span> idev</span> </a> </li>
<li class="irods"> <a href="https://portal.tacc.utexas.edu/software/irods"> <span> IRODS</span> </a> </li>
<li class="lammps"> <a href="https://portal.tacc.utexas.edu/software/lammps"> <span> LAMMPS</span> </a> </li>
<li class="matlab"> <a href="https://portal.tacc.utexas.edu/software/matlab"> <span> MATLAB</span> </a> </li>
<li class="modules"> <a href="https://portal.tacc.utexas.edu/software/modules"> <span> Modules</span> </a> </li>
<li class="namd"> <a href="https://portal.tacc.utexas.edu/software/namd"> <span> NAMD</span> </a> </li>
<li class="openfoam"> <a href="https://portal.tacc.utexas.edu/software/openfoam"> <span> OpenFOAM</span> </a> </li>
<li class="quantumespresso"> <a href="https://portal.tacc.utexas.edu/software/qe"> <span> Quantum Espresso</span> </a> </li>
<li class="remora"> <a href="https://portal.tacc.utexas.edu/software/remora"> <span> REMORA</span> </a> </li>
<li class="tau"> <a href="https://portal.tacc.utexas.edu/software/tau"> <span> TAU</span> </a> </li>
<li class="tensorflow"> <a href="https://portal.tacc.utexas.edu/software/tensorflow"> <span> Tensorflow</span> </a> </li>
<li class="vasp"> <a href="https://portal.tacc.utexas.edu/software/vasp"> <span> VASP</span> </a> </li>
</ul> </li>
<li class="tutorials"> <a href="https://portal.tacc.utexas.edu/tutorials"> <span> Tutorials</span> </a>
<ul class="tertiary">
<li class="accesscontrollists"> <a href="https://portal.tacc.utexas.edu/tutorials/acls"> <span> Access Control Lists</span> </a> </li>
<li class="bashquickstartguide"> <a href="https://portal.tacc.utexas.edu/tutorials/bashquickstart"> <span> Bash Quick Start Guide</span> </a> </li>
<li class="blaslapackattacc"> <a href="https://portal.tacc.utexas.edu/tutorials/blaslapack"> <span> BLAS/LAPACK at TACC</span> </a> </li>
<li class="ddtdebugger"> <a href="https://portal.tacc.utexas.edu/tutorials/ddt"> <span> DDT Debugger</span> </a> </li>
<li class="globusattacc"> <a href="https://portal.tacc.utexas.edu/tutorials/globus"> <span> Globus at TACC</span> </a> </li>
<li class="managingio"> <a href="https://portal.tacc.utexas.edu/tutorials/managingio"> <span> Managing I/O</span> </a> </li>
<li class="mapprofiler"> <a href="https://portal.tacc.utexas.edu/tutorials/map"> <span> MAP Profiler</span> </a> </li>
<li class="multifactorauthentication"> <a href="https://portal.tacc.utexas.edu/tutorials/multifactor-authentication"> <span> Multi-factor Authentication</span> </a> </li>
<li class="remotedesktopaccess"> <a href="https://portal.tacc.utexas.edu/tutorials/remote-desktop-access"> <span> Remote Desktop Access</span> </a> </li>
<li class="sharingprojectfiles"> <a href="https://portal.tacc.utexas.edu/tutorials/sharing-project-files"> <span> Sharing Project Files</span> </a> </li>
<li class="stockyardworkmigration"> <a href="https://portal.tacc.utexas.edu/tutorials/stockyard-work-migration"> <span> Stockyard /work Migration</span> </a> </li>
</ul> </li>
</ul> </li>
<li class="training"> <a href="https://portal.tacc.utexas.edu/training"> <span> Training</span> </a>
<ul class="secondary">
<li class="upcomingtraining"> <a href="https://learn.tacc.utexas.edu/"> <span> Upcoming Training</span> </a> </li>
<li class="taccinstitutes"> <a href="https://www.tacc.utexas.edu/education/institutes"> <span> TACC Institutes</span> </a> </li>
</ul> </li>
<li class="consulting"> <a href="https://portal.tacc.utexas.edu/consulting"> <span> Consulting</span> </a>
<ul class="secondary">
<li class="taccconsulting"> <a href="https://portal.tacc.utexas.edu/tacc-consulting"> <span> TACC Consulting</span> </a> </li>
<li class="servicesoverview"> <a href="https://portal.tacc.utexas.edu/consulting/overview"> <span> Services Overview</span> </a> </li>
</ul> </li>
<li class="about"> <a href="https://portal.tacc.utexas.edu/about"> <span> About</span> </a>
<ul class="secondary">
<li class="contact"> <a href="https://portal.tacc.utexas.edu/contact"> <span> Contact</span> </a> </li>
<li class="feedback"> <a href="https://portal.tacc.utexas.edu/feedback"> <span> Feedback</span> </a> </li>
<!-- This link opens in a new tab; rel="noopener" keeps the opened page from reaching this one through window.opener. -->
<li class="taccwebsite"> <a href="https://portal.tacc.utexas.edu/web/website" target="_blank" rel="noopener"> <span> TACC Website</span> </a> </li>
<li class="taccusagepolicy"> <a href="https://portal.tacc.utexas.edu/tacc-usage-policy"> <span> TACC Usage Policy</span> </a> </li>
<li class="tacccitation"> <a href="https://portal.tacc.utexas.edu/tacc-citation"> <span> TACC Citation</span> </a> </li>
</ul> </li>
</ul>
</nav>
</header>
<div id="content">
<div class="portlet-boundary portlet-boundary_103_ portlet-static portlet-static-end portlet-borderless " id="p_p_id_103_"> <span id="p_103"></span>
<div class="portlet-body">
</div>
</div>
<div class="columns-1" id="main-content" role="main">
<div class="portlet-layout">
<div class="portlet-column portlet-column-only" id="column-1">
<div class="portlet-dropzone portlet-column-content portlet-column-content-only" id="layout-column_column-1">
<div class="portlet-boundary portlet-boundary_56_ portlet-static portlet-static-end portlet-borderless portlet-journal-content enable-toc" id="p_p_id_56_INSTANCE_RYXn3pn9Wi4j_"> <span id="p_56_INSTANCE_RYXn3pn9Wi4j"></span>
<div class="portlet-body">
<div class="portlet-borderless-container" style="">
<div class="portlet-body">
<div class="journal-content-article" id="article_10132_10157_1476421_18.9">
<script type="text/javascript">/*<![CDATA[*/
/*
 * Disclosure toggles for the collapsible panels identified by the ids
 * knlserial, skxserial, knlmpi, skxmpi, knlopenmp, skxopenmp, knlhybrid and skxhybrid.
 *
 * Each public showhide<name>() function keeps its original global name and
 * zero-argument signature; all of them delegate to showhideToggle().
 */

/**
 * Toggle one panel and keep its arrow icon in sync.
 *
 * The panel is the element with id `name`; its icon is the element with id
 * "img-" + name. A panel counts as expanded only when its inline
 * style.display is exactly "block"; any other value (including empty) is
 * treated as collapsed, so the first call expands it.
 *
 * @param {string} name  id of the panel element.
 */
function showhideToggle(name) {
  var COLLAPSED_ICON = "/documents/10157/0/small-right-arrow.png/32a37818-3255-40f3-bb33-795fac19a3dd?t=1480440057000";
  var EXPANDED_ICON = "/documents/10157/0/small-down-arrow.png/05df5954-1484-4c56-8e54-1c2c0eb501dc?t=1480439900000";

  var panel = document.getElementById(name);
  if (!panel) {
    // Nothing to toggle on pages that do not include this panel.
    return;
  }

  var expand = panel.style.display != "block";
  panel.style.display = expand ? "block" : "none";

  // The icon is optional: a missing <img> must not prevent the panel from toggling.
  var icon = document.getElementById("img-" + name);
  if (icon) {
    icon.src = expand ? EXPANDED_ICON : COLLAPSED_ICON;
  }
}

function showhideknlserial() { showhideToggle("knlserial"); }
function showhideskxserial() { showhideToggle("skxserial"); }
function showhideknlmpi() { showhideToggle("knlmpi"); }
function showhideskxmpi() { showhideToggle("skxmpi"); }
function showhideknlopenmp() { showhideToggle("knlopenmp"); }
function showhideskxopenmp() { showhideToggle("skxopenmp"); }
function showhideknlhybrid() { showhideToggle("knlhybrid"); }
function showhideskxhybrid() { showhideToggle("skxhybrid"); }
/*]]>*/</script>
<style>.help{box-sizing:border-box}.help *,.help *:before,.help *:after{box-sizing:inherit}.row{margin-bottom:10px;margin-left:-15px;margin-right:-15px}.row:before,.row:after{content:" ";display:table}.row:after{clear:both}[class*="col-"]{box-sizing:border-box;float:left;position:relative;min-height:1px;padding-left:15px;padding-right:15px}.col-1-5{width:20%}.col-2-5{width:40%}.col-3-5{width:60%}.col-4-5{width:80%}.col-1-4{width:25%}.col-1-3{width:33.3%}.col-1-2,.col-2-4{width:50%}.col-2-3{width:66.7%}.col-3-4{width:75%}.col-1-1{width:100%}article.help{font-size:1.25em;line-height:1.2em}.text-center{text-align:center}figure{display:block;margin-bottom:20px;line-height:1.42857143;border:1px solid #ddd;border-radius:4px;padding:4px;text-align:center}figcaption{font-weight:bold}.lead{font-size:1.7em;line-height:1.4;font-weight:300}.embed-responsive{position:relative;display:block;height:0;padding:0;overflow:hidden}.embed-responsive-16by9{padding-bottom:56.25%}.embed-responsive .embed-responsive-item,.embed-responsive embed,.embed-responsive iframe,.embed-responsive object,.embed-responsive video{position:absolute;top:0;bottom:0;left:0;width:100%;height:100%;border:0}</style> <span style="font-size:225%; font-weight:bold;">Stampede2 User Guide</span>
<br> <i>Last update: April 9, 2021</i> <span style="font-size:90%;">see <a href="#history">revision history</a></span>
<div id="notices">
<!-- The heading has its own id because ids must be unique within a document; "notices" names this wrapping div, which is what the #notices link targets. -->
<h1 id="notices-heading"><a href="#notices">Notices</a></h1>
<ul>
<li><strong>All users: read <a href="https://portal.tacc.utexas.edu/tutorials/managingio">Managing I/O on TACC Resources</a>.</strong> TACC Staff have put forth new file system and job submission guidelines. (01/09/20)</li>
<li><strong>The Intel 18 compiler has replaced Intel 17 as the default compiler on Stampede2.</strong> The Intel 17 compiler and software stack are still available to those who load the appropriate modules explicitly. See <a href="https://portal.tacc.utexas.edu/user-guides/stampede2/intel">Intel 18 to Become New Default Compiler on Stampede2</a> for more information. (02/26/19)</li>
<li><strong>In order to balance queue wait times, the charge rate for all <a href="#queues">KNL queues</a> has been adjusted to 0.8 SUs per node-hour.</strong> The charge rate for the SKX queues remains at 1 SU. (01/14/19)</li>
<li><strong>Stampede2's Knights Landing (KNL) compute nodes each have 68 cores</strong>, and each core has 4 hardware threads. But it may not be a good idea to use all 272 hardware threads simultaneously, and it's certainly not the first thing you should try. In most cases it's best to specify no more than 64-68 MPI tasks or independent processes per node, and 1-2 threads/core. See <a href="#programming-knl-bestpractices">Best Known Practices…</a> for more information.</li>
<li><strong>Stampede2's Skylake (SKX) compute nodes each have 48 cores</strong> on two sockets (24 cores/socket). Hyperthreading is enabled: there are two hardware threads per core, for a total of 48 x 2 = 96 hardware threads per node. See <a href="#table2">Table 2</a> for more information. Note that SKX nodes have their own <a href="#running-queues">queues</a>.</li>
</ul>
</div>
<div id="figure1">
<figure>
<!-- NOTE(review): alt text is derived from the caption; refine it after viewing the image. -->
<img src="/documents/10157/1475729/S2_with_paintjob.jpg/3729b4fb-4fe8-4130-aec5-d95b1852a84b?t=1501774382263" alt="Stampede2 system" style="width: 800px; height: 593px;">
<p></p>
<figcaption>
Figure 1. Stampede2 System
</figcaption>
</figure>
</div>
<div id="intro">
<h1 id="introduction"><a href="#intro">Introduction</a></h1>
<p>Stampede2, generously funded by the National Science Foundation (NSF) through award ACI-1134872, is one of the flagship supercomputers of the Texas Advanced Computing Center (TACC) at the University of Texas at Austin. Stampede2 entered full production in Fall 2017 as an 18-petaflop national resource that builds on the successes of the original Stampede system it replaces. The first phase of the Stampede2 rollout featured the second generation of processors based on Intel's Many Integrated Core (MIC) architecture. Stampede2's 4,200 Knights Landing (KNL) nodes represent a radical break with the first-generation Knights Corner (KNC) MIC coprocessor. Unlike the legacy KNC, a Stampede2 KNL is not a coprocessor: each 68-core KNL is a stand-alone, self-booting processor that is the sole processor in its node. Phase 2 added to Stampede2 a total of 1,736 Intel Xeon Skylake (SKX) nodes.</p>
</div>
<div id="overview">
<h1 id="system-overview"><a href="#overview">System Overview</a></h1>
<div id="overview-phase1computenodes">
<h2 id="knl-compute-nodes"><a href="#overview-phase1computenodes">KNL Compute Nodes</a></h2>
<p>Stampede2 hosts 4,200 KNL compute nodes, including 504 KNL nodes that were formerly configured as a Stampede1 sub-system.</p>
<p>Each of Stampede2's KNL nodes includes 96GB of traditional DDR4 Random Access Memory (RAM). They also feature an additional 16GB of high bandwidth, on-package memory known as Multi-Channel Dynamic Random Access Memory (<strong>MCDRAM</strong>) that is up to four times faster than DDR4. The KNL's memory is configurable in two important ways: there are BIOS settings that determine at boot time the processor's <strong>memory mode</strong> and <strong>cluster mode</strong>. The processor's <strong>memory mode</strong> determines whether the fast MCDRAM operates as RAM, as direct-mapped L3 cache, or as a mixture of the two. The <strong>cluster mode</strong> determines the mechanisms for achieving cache coherency, which in turn determines latency: roughly speaking, this mode specifies the degree to which some memory addresses are "closer" to some cores than to others. See "<a href="#programming-knl">Programming and Performance: KNL</a>" below for a top-level description of these and other available memory and cluster modes.</p>
</div>
<div id="table1">
<p><a href="#table1">Table 1. Stampede2 KNL Compute Node Specifications</a></p>
<table border="1" cellpadding="3">
<tbody>
<tr>
<td align="right">Model:&nbsp;</td>
<td>Intel Xeon Phi 7250 ("Knights Landing")</td>
</tr>
<tr>
<td align="right" nowrap>Total cores per KNL node:&nbsp;</td>
<td>68 cores on a single socket</td>
</tr>
<tr>
<td align="right" nowrap>Hardware threads per core:&nbsp;</td>
<td>4</td>
</tr>
<tr>
<td align="right" nowrap>Hardware threads per node:&nbsp;</td>
<td>68 x 4 = 272</td>
</tr>
<tr>
<td align="right">Clock rate:&nbsp;</td>
<td>1.4GHz</td>
</tr>
<tr>
<td align="right">RAM:&nbsp;</td>
<td>96GB DDR4 plus 16GB high-speed MCDRAM. Configurable in two important ways; see "<a href="#programming-knl">Programming and Performance: KNL</a>" for more info.</td>
</tr>
<tr>
<td align="right">Cache:&nbsp;</td>
<td>32KB L1 data cache per core; 1MB L2 per two-core tile. In default config, <a href="#programming-knl-memorymodes">MCDRAM</a> operates as 16GB direct-mapped L3.</td>
</tr>
<tr>
<td align="right">Local storage:&nbsp;</td>
<td>All but 504 KNL nodes have a 107GB <code>/tmp</code> partition on a 200GB Solid State Drive (SSD). The 504 KNLs originally installed as the Stampede1 KNL sub-system each have a 32GB <code>/tmp</code> partition on 112GB SSDs. The latter nodes currently make up the <code>development</code>, <code>long</code> and <span style="white-space: nowrap;"><code>flat-quadrant</code></span> <a href="#running-queues">queues</a>. Size of <code>/tmp</code> partitions as of 24 Apr 2018.</td>
</tr>
</tbody>
</table>
</div>
<div id="overview-skxcomputenodes">
<h2 id="skx-compute-nodes"><a href="#overview-skxcomputenodes">SKX Compute Nodes</a></h2>
<p>Stampede2 hosts 1,736 SKX compute nodes.</p>
</div>
<div id="table2">
<p><a href="#table2">Table 2. Stampede2 SKX Compute Node Specifications</a></p>
<table border="1" cellpadding="3">
<tbody>
<tr>
<td align="right">Model:&nbsp;</td>
<td>Intel Xeon Platinum 8160 ("Skylake")</td>
</tr>
<tr>
<td align="right">Total cores per SKX node:&nbsp;</td>
<td>48 cores on two sockets (24 cores/socket)</td>
</tr>
<tr>
<td align="right" nowrap>Hardware threads per core:&nbsp;</td>
<td>2</td>
</tr>
<tr>
<td align="right" nowrap>Hardware threads per node:&nbsp;</td>
<td>48 x 2 = 96</td>
</tr>
<tr>
<td align="right">Clock rate:&nbsp;</td>
<td>2.1GHz nominal (1.4-3.7GHz depending on instruction set and number of active cores)</td>
</tr>
<tr>
<td align="right">RAM:&nbsp;</td>
<td>192GB (2.67GHz) DDR4</td>
</tr>
<tr>
<td align="right">Cache:&nbsp;</td>
<td>32KB L1 data cache per core; 1MB L2 per core; 33MB L3 per socket. Each socket can cache up to 57MB (sum of L2 and L3 capacity).</td>
</tr>
<tr>
<td align="right">Local storage:&nbsp;</td>
<td>144GB <code>/tmp</code> partition on a 200GB SSD. Size of <code>/tmp</code> partition as of 14 Nov 2017.</td>
</tr>
</tbody>
</table>
</div>
<div id="overview-loginnodes">
<h2 id="login-nodes"><a href="#overview-loginnodes">Login Nodes</a></h2>
<p>The Stampede2 login nodes, upgraded at the start of Phase 2, are Intel Xeon Gold 6132 (SKX) nodes, each with 28 cores on two sockets (14 cores/socket). They replace the decommissioned Broadwell login nodes used during Phase 1.</p>
</div>
<div id="overview-network">
<h2 id="network"><a href="#overview-network">Network</a></h2>
<p>The interconnect is a 100Gb/sec Intel Omni-Path (OPA) network with a fat tree topology employing six core switches. There is one leaf switch for each 28-node half rack, each with 20 leaf-to-core uplinks (28/20 oversubscription).</p>
</div>
<div id="overview-filesystems">
<h2 id="file-systems-introduction"><a href="#overview-filesystems">File Systems Introduction</a></h2>
<p>Stampede2 mounts three shared Lustre file systems on which each user has corresponding account-specific directories <a href="#files-filesystems"><code>$HOME</code>, <code>$WORK</code>, and <code>$SCRATCH</code></a>. Each file system is available from all Stampede2 nodes; the <a href="https://www.tacc.utexas.edu/systems/stockyard">Stockyard-hosted work file system</a> is available on most other TACC HPC systems as well. See <a href="#files-filesystems">Navigating the Shared File Systems</a> for detailed information as well as the <a href="#table-file-system-usage-recommendations">Good Citizenship</a> file system guidelines.</p>
<p class="portlet-msg-info">The <code>$SCRATCH</code> file system, as its name indicates, is a temporary storage space. Files that have not <a href="#accesstime">been accessed*</a> in ten days are subject to purge. Deliberately modifying file access time (using any method, tool, or program) for the purpose of circumventing purge policies is prohibited.</p>
</div>
<div id="table3">
<p><a href="#table3">Table 3. Stampede2 File Systems</a></p>
<table border="1" cellpadding="3">
<tbody>
<!-- Header row: scope="col" ties each header cell to its column for assistive technology. -->
<tr>
<th scope="col" nowrap>File System</th>
<th scope="col">Quota</th>
<th scope="col">Key Features</th>
</tr>
<tr>
<td><code>$HOME</code></td>
<td>10GB, 200,000 files</td>
<td><b>Not intended for parallel or high-intensity file operations.</b><br>Backed up regularly.<br>Overall capacity ~1PB. Two Meta-Data Servers (MDS), four Object Storage Targets (OSTs).<br>Defaults: 1 stripe, 1MB stripe size.<br>Not purged.<br></td>
</tr>
<tr>
<td><code>$WORK</code></td>
<td>1TB, 3,000,000 files across all TACC systems,<br>regardless of where on the file system the files reside.</td>
<td><b>Not intended for high-intensity file operations or jobs involving very large files.</b><br>On the Global Shared File System that is mounted on most TACC systems.<br>See <a href="https://www.tacc.utexas.edu/systems/stockyard">Stockyard system description</a> for more information.<br>Defaults: 1 stripe, 1MB stripe size<br>Not backed up.<br>Not purged.<br></td>
</tr>
<tr>
<td><code>$SCRATCH</code></td>
<td>no quota</td>
<td>Overall capacity ~30PB. Four MDSs, 66 OSTs.<br>Defaults: 1 stripe, 1MB stripe size.<br>Not backed up.<br><b>Files are subject to purge if <a href="#accesstime">access time*</a> is more than 10 days old</b>.</td>
</tr>
</tbody>
</table>
</div>
</div>
<div id="accesstime">
<p> </p>
<p> </p>
<p>*The operating system updates a file's access time when that file is modified on a login or compute node. Reading or executing a file/script on a login node does not update the access time, but reading or executing on a compute node does update the access time. This approach helps us distinguish between routine management tasks (e.g. <code>tar</code>, <code>scp</code>) and production use. Use the command "<code>ls -ul</code>" to view access times.</p>
</div>
<div id="access">
<h1 id="accessing-the-system"><a href="#access">Accessing the System</a></h1>
<p>Access to all TACC systems now requires Multi-Factor Authentication (MFA). You can create an MFA pairing on the TACC User Portal. After login on the portal, go to your account profile (Home-&gt;Account Profile), then click the "Manage" button under "Multi-Factor Authentication" on the right side of the page. See <a href="http://portal.tacc.utexas.edu/tutorials/multifactor-authentication">Multi-Factor Authentication at TACC</a> for further information.</p>
<div id="access-ssh">
<h2 id="secure-shell-ssh"><a href="#access-ssh">Secure Shell (SSH)</a></h2>
<p>The "<code>ssh</code>" command (SSH protocol) is the standard way to connect to Stampede2. SSH also includes support for the file transfer utilities <code>scp</code> and <code>sftp</code>. <a href="https://en.wikipedia.org/wiki/Secure_Shell">Wikipedia</a> is a good source of information on SSH. SSH is available within Linux and from the terminal app in the Mac OS. If you are using Windows, you will need an SSH client that supports the SSH-2 protocol: e.g. <a href="http://www.bitvise.com">Bitvise</a>, <a href="http://www.openssh.com">OpenSSH</a>, <a href="http://www.putty.org">PuTTY</a>, or <a href="https://www.vandyke.com/products/securecrt/">SecureCRT</a>. Initiate a session using the <code>ssh</code> command or the equivalent; from the Linux command line the launch command looks like this:</p>
<pre class="cmd-line">localhost$ <b>ssh myusername@stampede2.tacc.utexas.edu</b></pre>
<p>The above command will rotate connections across all available login nodes and route your connection to one of them. To connect to a specific login node, use its full domain name:</p>
<pre class="cmd-line">localhost$ <b>ssh myusername@login2.stampede2.tacc.utexas.edu</b></pre>
<p>To connect with X11 support on Stampede2 (usually required for applications with graphical user interfaces), use the <span style="white-space: nowrap;">"<code>-X</code>"</span> or <span style="white-space: nowrap;">"<code>-Y</code>"</span> switch:</p>
<pre class="cmd-line">localhost$ <b>ssh -X myusername@stampede2.tacc.utexas.edu</b></pre>
<p>Use your TACC password, not your XSEDE password, for direct logins to TACC resources. You can change your TACC password through the <a href="http://portal.tacc.utexas.edu/">TACC User Portal</a>. Log into the portal, then select "Change Password" under the "HOME" tab. If you've forgotten your password, go to the <a href="http://portal.tacc.utexas.edu/">TACC User Portal</a> home page and select "Password Reset" under the Home tab.</p>
<p>To report a connection problem, execute the <code>ssh</code> command with the <span style="white-space: nowrap;">"<code>-vvv</code>"</span> option and include the verbose output when submitting a help ticket.</p>
<p><strong>Do not run the "<code>ssh-keygen</code>" command on Stampede2.</strong> This command will create and configure a key pair that will interfere with the execution of job scripts in the batch system. If you do this by mistake, you can recover by renaming or deleting the <code>.ssh</code> directory located in your home directory; the system will automatically generate a new one for you when you next log into Stampede2.</p>
<ol type="1">
<li>execute "<code>mv .ssh dot.ssh.old</code>"</li>
<li>log out</li>
<li>log into Stampede2 again</li>
</ol>
<p>After logging in again the system will generate a properly configured key pair.</p>
</div>
<div id="access-sso">
<h2 id="xsede-single-sign-on-hub"><a href="#access-sso">XSEDE Single Sign-On Hub</a></h2>
<p><a href="http://www.xsede.org">XSEDE</a> users can also access Stampede2 via the <a href="https://portal.xsede.org/single-sign-on-hub">XSEDE Single Sign-On Hub</a>.</p>
<p>When reporting a problem to the help desk, please execute the <code>gsissh</code> command with the <span style="white-space: nowrap;">"<code>-vvv</code>"</span> option and include the verbose output in your problem description.</p>
</div>
</div>
<div id="using">
<h1 id="using-stampede2"><a href="#using">Using Stampede2</a></h1>
<p>Stampede2 nodes run Red Hat Enterprise Linux 7. Regardless of your research workflow, <strong>you'll need to master Linux basics</strong> and a Linux-based text editor (e.g. <code>emacs</code>, <code>nano</code>, <code>gedit</code>, or <code>vi/vim</code>) to use the system properly. This user guide does not address these topics, however. There are numerous resources in a variety of formats that are available to help you learn Linux, including some listed on the <a href="https://portal.tacc.utexas.edu/training/course-materials">TACC</a> and <a href="https://portal.xsede.org/training/overview">XSEDE</a> training sites. If you encounter a term or concept in this user guide that is new to you, a quick internet search should help you resolve the matter quickly.</p>
<div id="using-account">
<h2 id="configuring-your-account"><a href="#using-account">Configuring Your Account</a></h2>
<div id="using-account-shell">
<h3 id="linux-shell"><a href="#using-account-shell">Linux Shell</a></h3>
<p>The default login shell for your user account is Bash. To determine your current login shell, execute:</p>
<pre class="cmd-line">$ <b>echo $SHELL</b></pre>
<p>If you'd like to change your login shell to <code>csh</code>, <code>sh</code>, <code>tcsh</code>, or <code>zsh</code>, submit a ticket through the <a href="http://portal.tacc.utexas.edu/">TACC</a> or <a href="http://portal.xsede.org/">XSEDE</a> portal. The "<code>chsh</code>" ("change shell") command will not work on TACC systems.</p>
<p>When you start a shell on Stampede2, system-level startup files initialize your account-level environment and aliases before the system sources your own user-level startup scripts. You can use these startup scripts to customize your shell by defining your own environment variables, aliases, and functions. These scripts (e.g. <code>.profile</code> and <code>.bashrc</code>) are generally hidden files: so-called dotfiles that begin with a period, visible when you execute: <span style="white-space: nowrap;">"<code>ls -a</code>"</span>.</p>
<p>Before editing your startup files, however, it's worth taking the time to understand the basics of how your shell manages startup. Bash startup behavior is very different from the simpler <code>csh</code> behavior, for example. The Bash startup sequence varies depending on how you start the shell (e.g. using <code>ssh</code> to open a login shell, executing the "<code>bash</code>" command to begin an interactive shell, or launching a script to start a non-interactive shell). Moreover, Bash does not automatically source your <code>.bashrc</code> when you start a login shell by using <code>ssh</code> to connect to a node. Unless you have specialized needs, however, this is undoubtedly more flexibility than you want: you will probably want your environment to be the same regardless of how you start the shell. The easiest way to achieve this is to execute <span style="white-space: nowrap;">"<code>source ~/.bashrc</code>"</span> from your "<code>.profile</code>", then put all your customizations in "<code>.bashrc</code>". The system-generated default startup scripts demonstrate this approach. We recommend that you use these default files as templates.</p>
<p>For more information see the <a href="https://portal.tacc.utexas.edu/tutorials/bashquickstart">Bash Users' Startup Files: Quick Start Guide</a> and other online resources that explain shell startup. To recover the originals that appear in a newly created account, execute <span style="white-space: nowrap;">"<code>/usr/local/startup_scripts/install_default_scripts</code>"</span>.</p>
</div>
<div id="using-account-envvars">
<h3 id="environment-variables"><a href="#using-account-envvars">Environment Variables</a></h3>
<p>Your environment includes the environment variables and functions defined in your current shell: those initialized by the system, those you define or modify in your account-level startup scripts, and those defined or modified by the <a href="#using-modules">modules</a> that you load to configure your software environment. Be sure to distinguish between an environment variable's name (e.g. <code>HISTSIZE</code>) and its value (<code>$HISTSIZE</code>). Understand as well that a sub-shell (e.g. a script) inherits environment variables from its parent, but does not inherit ordinary shell variables or aliases. Use <code>export</code> (in Bash) or <code>setenv</code> (in <code>csh</code>) to define an environment variable.</p>
<p>Execute the "<code>env</code>" command to see the environment variables that define the way your shell and child shells behave.</p>
<p>Pipe the results of <code>env</code> into <code>grep</code> to focus on specific environment variables. For example, to see all environment variables that contain the string GIT (in all caps), execute:</p>
<pre class="cmd-line">$ <b>env | grep GIT</b></pre>
<p>The environment variables <code>PATH</code> and <code>LD_LIBRARY_PATH</code> are especially important. <code>PATH</code> is a colon-separated list of directory paths that determines where the system looks for your executables. <code>LD_LIBRARY_PATH</code> is a similar list that determines where the system looks for shared libraries.</p>
</div>
<div id="using-account-diagnostics">
<h3 id="account-level-diagnostics"><a href="#using-account-diagnostics">Account-Level Diagnostics</a></h3>
<p>TACC's <code>sanitytool</code> module loads an account-level diagnostic package that detects common account-level issues and often walks you through the fixes. You should certainly run the package's <code>sanitycheck</code> utility when you encounter unexpected behavior. You may also want to run <code>sanitycheck</code> periodically as preventive maintenance. To run <code>sanitytool</code>'s account-level diagnostics, execute the following commands:</p>
<pre class="cmd-line">
login1$ <b>module load sanitytool</b>
login1$ <b>sanitycheck</b></pre>
<p>Execute "<code>module help sanitytool</code>" for more information.</p>
</div>
</div>
<div id="using-computenodes">
<h2 id="accessing-the-compute-nodes"><a href="#using-computenodes">Accessing the Compute Nodes</a></h2>
<p>You connect to Stampede2 through one of four "front-end" login nodes. The login nodes are shared resources: at any given time, there are many users logged into each of these login nodes, each preparing to access the "back-end" compute nodes (<a href="#figure2">Figure 2. Login and Compute Nodes</a>). What you do on the login nodes affects other users directly because you are competing for the same memory and processing power. This is the reason you should not run your applications on the login nodes or otherwise abuse them. Think of the login nodes as a prep area where you can manage files and compile code before accessing the compute nodes to perform research computations. See <a href="#citizenship">Good Citizenship</a> for more information.</p>
<p><strong>You can use your command-line prompt, or the "<code>hostname</code>" command, to tell you whether you are on a login node or a compute node</strong>. The default prompt, or any custom prompt containing "<code>\h</code>", displays the short form of the hostname (e.g. <code>c401-064</code>). The hostname for a Stampede2 login node begins with the string "<code>login</code>" (e.g. <code>login2.stampede2.tacc.utexas.edu</code>), while compute node hostnames begin with the character "<code>c</code>" (e.g. <code>c401-064.stampede2.tacc.utexas.edu</code>). Note that the default prompts on the compute nodes include the node type (<code>knl</code> or <code>skx</code>) as well. The environment variable <code>TACC_NODE_TYPE</code>, defined only on the compute nodes, also displays the node type. The simplified prompts in the User Guide examples are shorter than Stampede2's actual default prompts.</p>
<p>While some workflows, tools, and applications hide the details, there are three basic ways to access the compute nodes:</p>
<ol type="1">
<li><a href="#running-sbatch">Submit a <strong>batch job</strong> using the <code>sbatch</code> command</a>. This directs the scheduler to run the job unattended when there are resources available. Until your batch job begins it will wait in a <a href="#running-queues">queue</a>. You do not need to remain connected while the job is waiting or executing. See <a href="#running">Running Jobs</a> for more information. Note that the scheduler does not start jobs on a first come, first served basis; it juggles many variables to keep the machine busy while balancing the competing needs of all users. The best way to minimize wait time is to request only the resources you really need: the scheduler will have an easier time finding a slot for the two hours you need than for the 48 hours you unnecessarily request.</li>
<li>Begin an <a href="#running-idev"><strong>interactive session</strong> using <code>idev</code> or <code>srun</code></a>. This will log you into a compute node and give you a command prompt there, where you can issue commands and run code as if you were doing so on your personal machine. An interactive session is a great way to develop, test, and debug code. When you request an interactive session, the scheduler submits a job on your behalf. You will need to remain logged in until the interactive session begins.</li>
<li>Begin an <a href="#running-ssh">interactive session using <strong><code>ssh</code></strong></a> to connect to a compute node on which you are already running a job. This is a good way to open a second window into a node so that you can monitor a job while it runs.</li>
</ol>
<p>Be sure to request computing resources that are consistent with the type of application(s) you are running:</p>
<ul>
<li><p>A <strong>serial</strong> (non-parallel) application can only make use of a single core on a single node, and will only see that node's memory.</p></li>
<li><p>A threaded program (e.g. one that uses <strong>OpenMP</strong>) employs a shared memory programming model and is also restricted to a single node, but the program's individual threads can run on multiple cores on that node.</p></li>
<li><p>An <strong>MPI</strong> (Message Passing Interface) program can exploit the distributed computing power of multiple nodes: it launches multiple copies of its executable (MPI <strong>tasks</strong>, each assigned unique IDs called <strong>ranks</strong>) that can communicate with each other across the network. The tasks on a given node, however, can only directly access the memory on that node. Depending on the program's memory requirements, it may not be possible to run a task on every core of every node assigned to your job. If it appears that your MPI job is running out of memory, try launching it with fewer tasks per node to increase the amount of memory available to individual tasks.</p></li>
<li><p>A popular type of <strong>parameter sweep</strong> (sometimes called <strong>high throughput computing</strong>) involves submitting a job that simultaneously runs many copies of one serial or threaded application, each with its own input parameters ("Single Program Multiple Data", or SPMD). The "<code>launcher</code>" tool is designed to make it easy to submit this type of job. For more information:</p> <pre class="cmd-line">
$ <b>module load launcher</b>
$ <b>module help launcher</b></pre></li>
</ul>
</div>
<div id="figure2">
<figure>
<p><img src="/documents/10157/1475729/Login+to+compute+nodes/3283b149-527e-4fa3-99b3-c25449d468c5?t=1496615325692" style="width: 600px; height: 277px;"> </p>
<figcaption>
Figure 2. Login and compute nodes
</figcaption>
</figure>
</div>
<div id="using-modules">
<h2 id="using-modules-to-manage-your-environment"><a href="#using-modules">Using Modules to Manage your Environment</a></h2>
<p>Lmod, a module system developed and maintained at TACC, makes it easy to manage your environment so you have access to the software packages and versions that you need to conduct your research. This is especially important on a system like Stampede2 that serves thousands of users with an enormous range of needs. Loading a module amounts to choosing a specific package from among available alternatives:</p>
<pre class="cmd-line">
$ <b>module load intel</b> # load the default Intel compiler
$ <b>module load intel/17.0.4</b> # load a specific version of Intel compiler</pre>
<p>A module does its job by defining or modifying environment variables (and sometimes aliases and functions). For example, a module may prepend appropriate paths to <code>$PATH</code> and <code>$LD_LIBRARY_PATH</code> so that the system can find the executables and libraries associated with a given software package. The module creates the illusion that the system is installing software for your personal use. Unloading a module reverses these changes and creates the illusion that the system just uninstalled the software:</p>
<pre class="cmd-line">
$ <b>module load ddt</b> # defines DDT-related env vars; modifies others
$ <b>module unload ddt</b> # undoes changes made by load</pre>
<p>The module system does more, however. When you load a given module, the module system can automatically replace or deactivate modules to ensure the packages you have loaded are compatible with each other. In the example below, the module system automatically unloads one compiler when you load another, and replaces Intel-compatible versions of IMPI and PETSc with versions compatible with gcc:</p>
<pre class="cmd-line">
$ <b>module load intel</b> # load default version of Intel compiler
$ <b>module load petsc</b> # load default version of PETSc
$ <b>module load gcc</b> # change compiler
Lmod is automatically replacing "intel/17.0.4" with "gcc/7.1.0".
Due to MODULEPATH changes, the following have been reloaded:
1) impi/17.0.3 2) petsc/3.7</pre>
<p>On Stampede2, modules generally adhere to a TACC naming convention when defining environment variables that are helpful for building and running software. For example, the "<code>papi</code>" module defines <code>TACC_PAPI_BIN</code> (the path to PAPI executables), <code>TACC_PAPI_LIB</code> (the path to PAPI libraries), <code>TACC_PAPI_INC</code> (the path to PAPI include files), and <code>TACC_PAPI_DIR</code> (top-level PAPI directory). After loading a module, here are some easy ways to observe its effects:</p>
<pre class="cmd-line">
$ <b>module show papi</b> # see what this module does to your environment
$ <b>env | grep PAPI</b> # see env vars that contain the string PAPI
$ <b>env | grep -i papi</b> # case-insensitive search for 'papi' in environment</pre>
<p>To see the modules you currently have loaded:</p>
<pre class="cmd-line">
$ <b>module list</b></pre>
<p>To see all modules that you can load right now because they are compatible with the currently loaded modules:</p>
<pre class="cmd-line">
$ <b>module avail</b></pre>
<p>To see all installed modules, even if they are not currently available because they are incompatible with your currently loaded modules:</p>
<pre class="cmd-line">
$ <b>module spider</b> # list all modules, even those not available to load</pre>
<p>To filter your search:</p>
<pre class="cmd-line">
$ <b>module spider slep</b> # all modules with names containing 'slep'
$ <b>module spider sundials/2.5.0</b> # additional details on a specific module</pre>
<p>Among other things, the latter command will tell you which modules you need to load before the module is available to load. You might also search for modules that are tagged with a keyword related to your needs (though your success here depends on the diligence of the module writers). For example:</p>
<pre class="cmd-line">
$ <b>module keyword performance</b></pre>
<p>You can save a collection of modules as a personal default collection that will load every time you log into Stampede2. To do so, load the modules you want in your collection, then execute:</p>
<pre class="cmd-line">
$ <b>module save</b> # save the currently loaded collection of modules </pre>
<p>Two commands make it easy to return to a known, reproducible state:</p>
<pre class="cmd-line">
$ <b>module reset</b> # load the system default collection of modules
$ <b>module restore</b> # load your personal default collection of modules</pre>
<p>On TACC systems, the command "<code>module reset</code>" is equivalent to "<code>module purge; module load TACC</code>". It's a safer, easier way to get to a known baseline state than issuing the two commands separately.</p>
<p>Help text is available for both individual modules and the module system itself:</p>
<pre class="cmd-line">
$ <b>module help swr</b> # show help text for software package swr
$ <b>module help</b> # show help text for the module system itself</pre>
<p>See <a href="http://lmod.readthedocs.org">Lmod's online documentation</a> for more extensive documentation. The online documentation addresses the basics in more detail, but also covers several topics beyond the scope of the help text (e.g. writing and using your own module files).</p>
<p>It's safe to execute module commands in job scripts. In fact, this is a good way to write self-documenting, portable job scripts that produce reproducible results. If you use <span style="white-space: nowrap;">"<code>module save</code>"</span> to define a personal default module collection, it's rarely necessary to execute module commands in shell startup scripts, and it can be tricky to do so safely. If you do wish to put module commands in your startup scripts, see Stampede2's default startup scripts for a safe way to do so.</p>
</div>
</div>
<div id="citizenship">
<h1 id="good-citizenship"><a href="#citizenship">Citizenship</a></h1>
<p><strong>You share Stampede2 with many other users (sometimes hundreds)</strong>, and what you do on the system affects others. All users must follow a set of good practices that entail limiting activities that may impact the system for other users. Exercise good citizenship to ensure that your activity does not adversely impact the system and the research community with whom you share it.</p>
<p>TACC staff has developed the following guidelines for good citizenship on Stampede2. Please familiarize yourself especially with the first two mandates. The next sections discuss best practices on <a href="#citizenship-io">limiting and minimizing I/O activity</a> and <a href="#citizenship-transfers">file transfers</a>. Finally, we provide <a href="#citizenship-jobs">job submission tips</a> for constructing job scripts that help minimize wait times in the queues.</p>
<ul>
<li><a href="#citizenship-loginnodes">Do Not Run Jobs on the Login Nodes</a></li>
<li><a href="#citizenship-filesystems">Do Not Stress the Shared File Systems</a></li>
<li><a href="#citizenship-io">Limit Input/Output (I/O) Activity</a></li>
<li><a href="#citizenship-transfers">File Transfer Guidelines</a></li>
<li><a href="#citizenship-jobs">Job Submission Tips</a></li>
</ul>
<div id="citizenship-loginnodes">
<h2 id="do-not-run-jobs-on-the-login-nodes"><a href="#citizenship-loginnodes">Do Not Run Jobs on the Login Nodes</a></h2>
<p>Stampede2's few login nodes are shared among all users. Dozens (sometimes hundreds) of users may be logged on at one time accessing the file systems. Think of the login nodes as a prep area, where users may edit and manage files, compile code, perform file management, issue transfers, submit new and track existing batch jobs, etc. The login nodes provide an interface to the "back-end" compute nodes.</p>
<p>The compute nodes are where actual computations occur and where research is done. Hundreds of jobs may be running on all compute nodes, with hundreds more queued up to run. All batch jobs and executables, as well as development and debugging sessions, must be run on the compute nodes. To access compute nodes on TACC resources, one must either <a href="#running-sbatch">submit a job to a batch queue</a> or initiate an interactive session using the <a href="#running-idev"><code>idev</code></a> utility.</p>
<p>A single user running computationally expensive or disk-intensive tasks will negatively impact performance for other users. Running jobs on the login nodes is one of the fastest routes to account suspension. Instead, run on the compute nodes via an interactive session (<a href="/software/idev"><code>idev</code></a>) or by <a href="#running">submitting a batch job</a>.</p>
<p class="portlet-msg-alert"> Do not run jobs or perform intensive computational activity on the login nodes or the shared file systems.<br>Your account may be suspended and you will lose access to the queues if your jobs are impacting other users. </p>
<div id="citizenship-loginnodes-examples">
<h3 id="dos-donts-on-the-login-nodes"><a href="#citizenship-loginnodes-examples">Dos &amp; Don'ts on the Login Nodes</a></h3>
<ul>
<li><p><strong>Do not run research applications on the login nodes;</strong> this includes frameworks like MATLAB and R, as well as computationally or I/O intensive Python scripts. If you need interactive access, use the <code>idev</code> utility or Slurm's <code>srun</code> to schedule one or more compute nodes.</p> <p>DO THIS: Start an interactive session on a compute node and run Matlab.</p> <pre class="cmd-line">
login1$ <b>idev</b>
c401-064$ <b>matlab</b></pre> <p>DO NOT DO THIS: Run Matlab or other software packages on a login node</p> <pre class="cmd-line"><s>login1$ <b>matlab</b></s></pre></li>
<li><p><strong>Do not launch too many simultaneous processes;</strong> while it's fine to compile on a login node, a command like "<span style="white-space: nowrap;"><code>make -j 16</code></span>" (which compiles on 16 cores) may impact other users.</p> <p>DO THIS: build and submit a batch job. All batch jobs run on the compute nodes.</p> <pre class="cmd-line">
login1$ <b>make <i>mytarget</i></b>
login1$ <b>sbatch <i>myjobscript</i></b></pre> <p>DO NOT DO THIS: Invoke multiple build sessions.</p> <pre class="cmd-line">login1$ <s><b>make -j 12</b></s></pre> <p>DO NOT DO THIS: Run an executable on a login node.</p> <pre class="cmd-line">
login1$ <s><b>./myprogram</b></s></pre></li>
<li><p><strong>That script you wrote to poll job status should probably do so once every few minutes rather than several times a second.</strong></p></li>
</ul>
</div>
</div>
<div id="citizenship-filesystems">
<h2 id="do-not-stress-the-shared-file-systems"><a href="#citizenship-filesystems">Do Not Stress the Shared File Systems</a></h2>
<p>The TACC Global Shared File System, Stockyard, is mounted on most TACC HPC resources as the <code>/work</code> (<code>$WORK</code>) directory. This file system is accessible to all TACC users, and therefore experiences a lot of I/O activity (reading and writing to disk, opening and closing files) as users run their jobs, read and generate data including intermediate and checkpointing files. As TACC adds more users, the stress on the <code>$WORK</code> file system is increasing to the extent that TACC staff is now recommending new job submission guidelines in order to reduce stress and I/O on Stockyard.</p>
<p><strong>TACC staff now recommends that you run your jobs out of the <code>$SCRATCH</code> file system instead of the global <code>$WORK</code> file system.</strong></p>
<p>To run your jobs out of <code>$SCRATCH</code>:</p>
<ul>
<li>Copy or move all job input files to <code>$SCRATCH</code></li>
<li>Make sure your job script directs all output to <code>$SCRATCH</code><br> </li>
<li>Once your job is finished, move your output files to <code>$WORK</code> to avoid any data purges.</li>
</ul>
<p class="portlet-msg-alert"> Compute nodes should not reference <code>$WORK</code> unless it's to stage data in/out only before/after jobs. </p>
<p>Consider that <code>$HOME</code> and <code>$WORK</code> are for storage and keeping track of important items. Actual job activity, reading and writing to disk, should be offloaded to your resource's <code>$SCRATCH</code> file system (see <a href="#table-file-system-usage-recommendations">Table. File System Usage Recommendations</a>). You can start a job from anywhere but the actual work of the job should occur only on the <code>$SCRATCH</code> partition. You can save original items to <code>$HOME</code> or <code>$WORK</code> so that you can copy them over to <code>$SCRATCH</code> if you need to re-generate results.</p>
<div id="citizenship-filesystems-tips">
<h3 id="more-file-system-tips"><a href="#citizenship-filesystems-tips">More File System Tips</a></h3>
<ul>
<li><p><strong>Don't run jobs in your <code>$HOME</code> directory.</strong> The <code>$HOME</code> file system is for routine file management, not parallel jobs.</p></li>
<li><p><strong>Watch all your <a href="#files">file system quotas</a>.</strong> If you're near your quota in <code>$WORK</code> and your job is repeatedly trying (and failing) to write to <code>$WORK</code>, you will stress that file system. If you're near your quota in <code>$HOME</code>, jobs run on any file system may fail, because all jobs write some data to the hidden <code>$HOME/.slurm</code> directory.</p></li>
<li><p><strong>Avoid storing many small files in a single directory, and avoid workflows that require many small files</strong>. A few hundred files in a single directory is probably fine; tens of thousands is almost certainly too many. If you must use many small files, group them in separate directories of manageable size.</p></li>
<li><p>TACC resources, with a few exceptions, mount three file systems: <code>/home</code>, <code>/work</code> and <code>/scratch</code>. Please follow each file system's recommended usage.</p></li>
</ul>
</div>
<div id="table-file-system-usage-recommendations">
<h3 id="file-system-usage-recommendations"><a href="#table-file-system-usage-recommendations">File System Usage Recommendations</a></h3>
</div>
</div>
<table border="1" cellpadding="5" cellspacing="3">
<tbody>
<tr>
<th>File System</th>
<th>Best Storage Practices</th>
<th>Best Activities</th>
</tr>
<tr>
<td><code>$HOME</code></td>
<td>cron jobs<br>small scripts<br>environment settings</td>
<td>compiling, editing</td>
</tr>
<tr>
<td><code>$WORK</code></td>
<td>store software installations<br> original datasets that can't be reproduced<br> job scripts and templates</td>
<td>staging datasets</td>
</tr>
<tr>
<td><code>$SCRATCH</code></td>
<td><b>Temporary Storage</b><br>I/O files<br>job files<br>temporary datasets</td>
<td>all job I/O activity</td>
</tr>
</tbody>
</table>
<p class="portlet-msg-info">The <code>$SCRATCH</code> file system, as its name indicates, is a temporary storage space. Files that have not <a href="#accesstime">been accessed*</a> in ten days are subject to purge. Deliberately modifying file access time (using any method, tool, or program) for the purpose of circumventing purge policies is prohibited.</p>
<div id="citizenship-io">
<h2 id="limit-inputoutput-io-activity"><a href="#citizenship-io">Limit Input/Output (I/O) Activity</a></h2>
<p>In addition to the file system tips above, it's important that your jobs limit all I/O activity. This section focuses on ways to avoid causing problems on each resource's shared file systems.</p>
<ul>
<li><p><strong>Limit I/O intensive sessions</strong> (lots of reads and writes to disk, rapidly opening or closing many files)</p></li>
<li><p><strong>Avoid opening and closing files repeatedly</strong> in tight loops. Every open/close operation on the file system requires interaction with the MetaData Service (MDS). The MDS acts as a gatekeeper for access to files on Lustre's parallel file system. Overloading the MDS will affect other users on the system. If possible, open files once at the beginning of your program/workflow, then close them at the end.</p></li>
<li><p><strong>Don't get greedy.</strong> If you know or suspect your workflow is I/O intensive, don't submit a pile of simultaneous jobs. Writing restart/snapshot files can stress the file system; avoid doing so too frequently. Also, use the <code>hdf5</code> or <code>netcdf</code> libraries to generate a single restart file in parallel, rather than generating files from each process separately.</p></li>
</ul>
<p class="portlet-msg-alert"> If you know your jobs will require significant I/O, please submit a support ticket and an HPC consultant will work with you. See also <a href="/tutorials/managingio">Managing I/O on TACC Resources</a> for additional information. </p>
</div>
<div id="citizenship-transfers">
<h2 id="file-transfer-guidelines"><a href="#citizenship-transfers">File Transfer Guidelines</a></h2>
<p>In order to not stress both internal and external networks, be mindful of the following guidelines:</p>
<ul>
<li><p>When creating or transferring <strong>large files</strong> to Stockyard (<code>$WORK</code>) or the <code>$SCRATCH</code> file systems, <strong>be sure to stripe the receiving directories appropriately</strong>. See <a href="#files-striping">Striping Large Files</a> for more information.</p></li>
<li><p><strong>Avoid too many simultaneous file transfers</strong>. You share the network bandwidth with other users; don't use more than your fair share. Two or three concurrent <code>scp</code> sessions is probably fine. Twenty is probably not.</p></li>
<li><p><strong>Avoid recursive file transfers</strong>, especially those involving many small files. Create a <code>tar</code> archive before transfers. This is especially true when transferring files to or from <a href="http://portal.tacc.utexas.edu/user-guides/ranch">Ranch</a>.</p></li>
</ul>
</div>
<div id="citizenship-jobs">
<h2 id="job-submission-tips"><a href="#citizenship-jobs">Job Submission Tips</a></h2>
<ul>
<li><p><strong>Request Only the Resources You Need.</strong> Make sure your job scripts request only the resources that are needed for that job. Don't ask for more time or more nodes than you really need. The scheduler will have an easier time finding a slot for a job requesting 2 nodes for 2 hours, than for a job requesting 4 nodes for 24 hours. This means shorter queue wait times for you and everybody else.</p></li>
<li><p><strong>Test your submission scripts.</strong> Start small: make sure everything works on 2 nodes before you try 20. Work out submission bugs and kinks with 5 minute jobs that won't wait long in the queue and involve short, simple substitutes for your real workload: simple test problems; <span style="white-space: nowrap;"><code>hello world</code></span> codes; one-liners like <span style="white-space: nowrap;"><code>ibrun hostname</code></span>; or an <code>ldd</code> on your executable.</p></li>
<li><p><strong>Respect memory limits and other system constraints.</strong> If your application needs more memory than is available, your job will fail, and may leave nodes in unusable states. Use TACC's <a href="/software/remora">Remora</a> tool to monitor your application's needs.</p></li>
</ul>
</div>
</div>
<div id="files">
<h1 id="managing-your-files"><a href="#files">Managing Your Files</a></h1>
<p>Stampede2 mounts three Lustre file systems that are shared across all nodes: the home, work, and scratch file systems. Stampede2's startup mechanisms define corresponding account-level environment variables <code>$HOME</code>, <code>$SCRATCH</code>, and <code>$WORK</code> that store the paths to directories that you own on each of these file systems. Consult the <a href="#table3">Stampede2 File Systems</a> table for the basic characteristics of these file systems, <a href="#programming-fileio">File Operations: I/O Performance</a> for advice on performance issues, and <a href="#citizenship-filesystems-tips">Good Citizenship</a> for tips on file system etiquette.</p>
<div id="files-filesystems">
<h2 id="navigating-the-shared-file-systems"><a href="#files-filesystems">Navigating the Shared File Systems</a></h2>
<p>Stampede2's <code>/home</code> and <code>/scratch</code> file systems are mounted only on Stampede2, but the work file system mounted on Stampede2 is the Global Shared File System hosted on <a href="https://www.tacc.utexas.edu/systems/stockyard">Stockyard</a>. Stockyard is the same work file system that is currently available on Frontera, Lonestar5, and several other TACC resources.</p>
<p>The <code>$STOCKYARD</code> environment variable points to the highest-level directory that you own on the Global Shared File System. The definition of the <code>$STOCKYARD</code> environment variable is of course account-specific, but you will see the same value on all TACC systems that provide access to the Global Shared File System. This directory is an excellent place to store files you want to access regularly from multiple TACC resources.</p>
<p>Your account-specific <code>$WORK</code> environment variable varies from system to system and is a sub-directory of <code>$STOCKYARD</code> (<a href="#figure3">Figure 3</a>). The sub-directory name corresponds to the associated TACC resource. The <code>$WORK</code> environment variable on Stampede2 points to the <code>$STOCKYARD/stampede2</code> subdirectory, a convenient location for files you use and jobs you run on Stampede2. Remember, however, that all subdirectories contained in your <code>$STOCKYARD</code> directory are available to you from any system that mounts the file system. If you have accounts on both Stampede2 and Maverick, for example, the <code>$STOCKYARD/stampede2</code> directory is available from your Maverick account, and <code>$STOCKYARD/maverick</code> is available from your Stampede2 account.</p>
<p class="portlet-msg-alert"> Your quota and reported usage on the Global Shared File System reflects all files that you own on Stockyard, regardless of their actual location on the file system. </p>
<p>See the example for fictitious user <code>bjones</code> in the figure below. All directories are accessible from all systems, however a given sub-directory (e.g. <code>lonestar5</code>, <code>maverick2</code>) will exist <strong>only</strong> if you have an allocation on that system.</p>
</div>
<div id="figure3">
<figure>
<img alt="Stockyard Work file system" src="/documents/10157/1181317/Stockyard+filesystem+updated/8c7e3389-573c-4e12-9636-bf0856f3a389?t=1555952835748" style="width: 800px; height: 184px;">
<figcaption> <strong>Figure 3.</strong>
<br>Account-level directories on the work file system (Global Shared File System hosted on Stockyard). Example for fictitious user <code>bjones</code>. All directories usable from all systems. Sub-directories (e.g. <code>wrangler</code>, <code>maverick2</code>) exist only when you have allocations on the associated system.
</figcaption>
</figure>
<p>Note that resource-specific <span style="white-space: nowrap;">sub-directories</span> of <code>$STOCKYARD</code> are nothing more than convenient ways to manage your <span style="white-space: nowrap;">resource-specific</span> files. You have access to any such <span style="white-space: nowrap;">sub-directory</span> from any TACC resource. If you are logged into Stampede2, for example, executing the alias <code>cdw</code> (equivalent to <span style="white-space: nowrap;">"<code>cd $WORK</code>"</span>) will take you to the <span style="white-space: nowrap;">resource-specific</span> <span style="white-space: nowrap;">sub-directory</span> <code>$STOCKYARD/stampede2</code>. But you can access this directory from other TACC systems as well by executing <span style="white-space: nowrap;">"<code>cd $STOCKYARD/stampede2</code>"</span>. These commands allow you to share files across TACC systems. In fact, several convenient <span style="white-space: nowrap;">account-level</span> aliases make it even easier to navigate across the directories you own in the shared file systems:</p>
</div>
<div id="table4">
<p><a href="#table4">Table 4. Built-in Account Level Aliases</a></p>
<table border="1" cellpadding="3">
<tbody>
<tr>
<th colspan="2">Built-in Account Level Aliases</th>
</tr>
<tr>
<th>Alias</th>
<th>Command</th>
</tr>
<tr>
<td><code>cd</code> or <code>cdh</code></td>
<td><code>cd $HOME</code></td>
</tr>
<tr>
<td><code>cdw</code></td>
<td><code>cd $WORK</code></td>
</tr>
<tr>
<td><code>cds</code></td>
<td><code>cd $SCRATCH</code></td>
</tr>
<tr>
<td><code>cdy</code> or <code>cdg</code></td>
<td><code>cd $STOCKYARD</code></td>
</tr>
</tbody>
</table>
</div>
<div id="files-striping">
<h2 id="striping-large-files"><a href="#files-striping">Striping Large Files</a></h2>
<p>Stampede2's Lustre file systems look and act like a single logical hard disk, but are actually sophisticated integrated systems involving many physical drives (dozens of physical drives for <code>$HOME</code>, hundreds for <code>$WORK</code> and <code>$SCRATCH</code>).</p>
<p>Lustre can <strong>stripe</strong> (distribute) large files over several physical disks, making it possible to deliver the high performance needed to service input/output (I/O) requests from hundreds of users across thousands of nodes. Object Storage Targets (OSTs) manage the file system's spinning disks: a file with 16 stripes, for example, is distributed across 16 OSTs. One designated Meta-Data Server (MDS) tracks the OSTs assigned to a file, as well as the file's descriptive data.</p>
<p class="portlet-msg-alert"> Before transferring to, or creating large files on Stampede2, be sure to set an appropriate default stripe count on the receiving directory. </p>
<p>To avoid exceeding your fair share of any given OST, a good rule of thumb is to allow at least one stripe for each 100GB in the file. For example, to set the default stripe count on the current directory to 30 (a plausible stripe count for a directory receiving a file approaching 3TB in size), execute:</p>
<pre class="cmd-line">$ <b>lfs setstripe -c 30 $PWD</b></pre>
<p>Note that an "<code>lfs setstripe</code>" command always sets both stripe count and stripe size, even if you explicitly specify only one or the other. Since the example above does not explicitly specify stripe size, the command will set the stripe size on the directory to Stampede2's system default (1MB). In general there's no need to customize stripe size when creating or transferring files.</p>
<p>Remember that it's not possible to change the striping on a file that already exists. Moreover, the "<code>mv</code>" command has no effect on a file's striping if the source and destination directories are on the same file system. You can, of course, use the "<code>cp</code>" command to create a second copy with different striping; to do so, copy the file to a directory with the intended stripe parameters.</p>
<p>You can check the stripe count of a file using the "<code>lfs getstripe</code>" command:</p>
<pre class="cmd-line">$ <b>lfs getstripe <i>myfile</i></b></pre>
</div>
</div>
<div id="transferring">
<h1 id="transferring-files"><a href="#transferring">Transferring Files</a></h1>
<div id="transferring-scp">
<h2 id="transfer-using-scp"><a href="#transferring-scp">Transfer Using <code>scp</code></a></h2>
<p>You can transfer files between Stampede2 and Linux-based systems using either <a href="http://linux.com/learn/intro-to-linux/2017/2/how-securely-transfer-files-between-servers-scp"><code>scp</code></a> or <a href="http://linux.com/learn/get-know-rsync"><code>rsync</code></a>. Both <code>scp</code> and <code>rsync</code> are available in the Mac Terminal app. Windows <a href="http://portal.tacc.utexas.edu/user-guides/stampede2#secure-shell-ssh">ssh clients</a> typically include <code>scp</code>-based file transfer capabilities.</p>
<p>The Linux <code>scp</code> (secure copy) utility is a component of the OpenSSH suite. Assuming your Stampede2 username is <code>bjones</code>, a simple <code>scp</code> transfer that pushes a file named "<code>myfile</code>" from your local Linux system to Stampede2 <code>$HOME</code> would look like this:</p>
<pre class="cmd-line">localhost$ <b>scp ./myfile bjones@stampede2.tacc.utexas.edu:</b> # note colon after net address</pre>
<p>You can use wildcards, but you need to be careful about when and where you want wildcard expansion to occur. For example, to push all files ending in "<code>.txt</code>" from the current directory on your local machine to <code>/work/01234/bjones/stampede2</code> on Stampede2:</p>
<pre class="cmd-line">localhost$ <b>scp *.txt bjones@stampede2.tacc.utexas.edu:/work/01234/bjones/stampede2</b></pre>
<p>To delay wildcard expansion until reaching Stampede2, use a backslash ("<code>\</code>") as an escape character before the wildcard. For example, to pull all files ending in "<code>.txt</code>" from <code>/work/01234/bjones/stampede2</code> on Stampede2 to the current directory on your local system:</p>
<pre class="cmd-line">localhost$ <b>scp bjones@stampede2.tacc.utexas.edu:/work/01234/bjones/stampede2/\*.txt .</b></pre>
<p>You can of course use shell or environment variables in your calls to <code>scp</code>. For example:</p>
<pre class="cmd-line">
localhost$ <b>destdir="/work/01234/bjones/stampede2/data"</b>
localhost$ <b>scp ./myfile bjones@stampede2.tacc.utexas.edu:$destdir</b></pre>
<p>You can also issue <code>scp</code> commands on your local client that use Stampede2 environment variables like <code>$HOME</code>, <code>$WORK</code>, and <code>$SCRATCH</code>. To do so, use a backslash ("<code>\</code>") as an escape character before the "<code>$</code>"; this ensures that expansion occurs after establishing the connection to Stampede2:</p>
<pre class="cmd-line">localhost$ <b>scp ./myfile bjones@stampede2.tacc.utexas.edu:\$WORK/data</b> # Note backslash</pre>
<p>Avoid using <code>scp</code> for recursive ("<code>-r</code>") transfers of directories that contain nested directories of many small files:</p>
<pre class="cmd-line">localhost$ <s><b>scp -r ./mydata bjones@stampede2.tacc.utexas.edu:\$WORK</b></s> # DON'T DO THIS</pre>
<p>Instead, use <code>tar</code> to create an archive of the directory, then transfer the directory as a single file:</p>
<pre class="cmd-line">
localhost$ <b>tar cvf ./mydata.tar mydata</b> # create archive
localhost$ <b>scp ./mydata.tar bjones@stampede2.tacc.utexas.edu:\$WORK</b> # transfer archive</pre>
</div>
<div id="transferring-rsync">
<h2 id="transfer-using-rsync"><a href="#transferring-rsync">Transfer Using <code>rsync</code></a></h2>
<p>The <code>rsync</code> (remote synchronization) utility is a great way to synchronize files that you maintain on more than one system: when you transfer files using <code>rsync</code>, the utility copies only the changed portions of individual files. As a result, <code>rsync</code> is especially efficient when you only need to update a small fraction of a large dataset. The basic syntax is similar to <code>scp</code>:</p>
<pre class="cmd-line">
localhost$ <b>rsync mybigfile bjones@stampede2.tacc.utexas.edu:\$WORK/data</b>
localhost$ <b>rsync -avtr mybigdir bjones@stampede2.tacc.utexas.edu:\$WORK/data</b></pre>
<p>The options on the second transfer are typical and appropriate when synching a directory: this is a recursive update ("<code>-r</code>") with verbose ("<code>-v</code>") feedback; the synchronization preserves time stamps ("<code>-t</code>") as well as symbolic links and other meta-data ("<code>-a</code>"). Because <code>rsync</code> only transfers changes, recursive updates with <code>rsync</code> may be less demanding than an equivalent recursive transfer with <code>scp</code>.</p>
<p>See <a href="#files-striping">Striping Large Files</a> for additional important advice about striping the receiving directory when transferring or creating large files on TACC systems.</p>
<p>As detailed in the <a href="#citizenship">Citizenship</a> section above, it is important to monitor your quotas on the <code>$HOME</code> and <code>$WORK</code> file systems, and limit the number of simultaneous transfers. Remember also that <code>$STOCKYARD</code> (and your <code>$WORK</code> directory on each TACC resource) is available from several other TACC systems: there's no need for <code>scp</code> when both the source and destination involve sub-directories of <code>$STOCKYARD</code>. See <a href="#files">Managing Your Files</a> for more information about transfers on <code>$STOCKYARD</code>.</p>
</div>
<div id="transferring-globus">
<h2 id="transfer-using-globus"><a href="#transferring-globus">Transfer Using Globus</a></h2>
<p><a href="https://www.globus.org/how-it-works">Globus</a> is another way for XSEDE users to transfer data between XSEDE sites; see <a href="https://portal.xsede.org/software/globus">Globus at XSEDE</a> and <a href="https://portal.xsede.org/data-management">Data Transfer and Management</a> for more information. You can also use <a href="https://portal.tacc.utexas.edu/tutorials/globus">Globus</a> if you're affiliated with an institution like the University of Texas that provides access to <a href="http://cilogon.org">CILogon</a>.</p>
</div>
<div id="files-sharing">
<h2 id="sharing-files-with-collaborators"><a href="#files-sharing">Sharing Files with Collaborators</a></h2>
<p>If you wish to share files and data with collaborators in your project, see <a href="http://portal.tacc.utexas.edu/tutorials/sharing-project-files">Sharing Project Files on TACC Systems</a> for step-by-step instructions. Project managers or delegates can use Unix group permissions and commands to create read-only or read-write shared workspaces that function as data repositories and provide a common work area to all project members.</p>
</div>
</div>
<div id="building">
<h1 id="building-software"><a href="#building">Building Software</a></h1>
<p>The phrase "building software" is a common way to describe the process of producing a machine-readable executable file from source files written in C, Fortran, or some other programming language. In its simplest form, building software involves a simple, one-line call or short shell script that invokes a compiler. More typically, the process leverages the power of <a href="http://www.gnu.org/software/make/manual/make.html"><code>makefiles</code></a>, so you can change a line or two in the source code, then rebuild in a systematic way only the components affected by the change. Increasingly, however, the build process is a sophisticated multi-step automated workflow managed by a special framework like <a href="http://www.gnu.org/software/automake/manual/html_node/Autotools-Introduction.html">autotools</a> or <a href="http://cmake.org"><code>cmake</code></a>, intended to achieve a repeatable, maintainable, portable mechanism for installing software across a wide range of target platforms.</p>
<div id="building-basics">
<h2 id="basics-of-building-software"><a href="#building-basics">Basics of Building Software</a></h2>
<p>This section of the user guide does nothing more than introduce the big ideas with simple one-line examples. You will undoubtedly want to explore these concepts more deeply using online resources. You will quickly outgrow the examples here. We recommend that you master the basics of makefiles as quickly as possible: even the simplest computational research project will benefit enormously from the power and flexibility of a makefile-based build process.</p>
<div id="building-basics-intel">
<h3 id="intel-compilers"><a href="#building-basics-intel">Intel Compilers</a></h3>
<p>Intel is the recommended and default compiler suite on Stampede2. Each Intel module also gives you direct access to <code>mkl</code> without loading an <code>mkl</code> module; see <a href="#intel-math-kernel-library-mkl">Intel MKL</a> for more information. Here are simple examples that use the Intel compiler to build an executable from source code:</p>
<pre class="cmd-line">
$ <b>icc mycode.c</b> # C source file; executable a.out
$ <b>icc main.c calc.c analyze.c</b> # multiple source files
$ <b>icc mycode.c -o myexe</b> # C source file; executable myexe
$ <b>icpc mycode.cpp -o myexe</b> # C++ source file
$ <b>ifort mycode.f90 -o myexe</b> # Fortran90 source file</pre>
<p>Compiling a code that uses OpenMP would look like this:</p>
<pre class="cmd-line">$ <b>icc -qopenmp mycode.c -o myexe</b> # OpenMP</pre>
<p>See the published Intel documentation, available both <a href="http://software.intel.com/en-us/intel-software-technical-documentation">online</a> and in <code>${TACC_INTEL_DIR}/documentation</code>, for information on optimization flags and other Intel compiler options.</p>
</div>
<div id="building-basics-gnu">
<h3 id="gnu-compilers"><a href="#building-basics-gnu">GNU Compilers</a></h3>
<p>The GNU foundation maintains a number of high quality compilers, including a compiler for C (<code>gcc</code>), C++ (<code>g++</code>), and Fortran (<code>gfortran</code>). The <code>gcc</code> compiler is the foundation underneath all three, and the term "gcc" often means the suite of these three GNU compilers.</p>
<p>Load a <code>gcc</code> module to access a recent version of the GNU compiler suite. Avoid using the GNU compilers that are available without a <code>gcc</code> module — those will be older versions based on the "system gcc" that comes as part of the Linux distribution.</p>
<p>Here are simple examples that use the GNU compilers to produce an executable from source code:</p>
<pre class="cmd-line">
$ <b>gcc mycode.c</b> # C source file; executable a.out
$ <b>gcc mycode.c -o myexe</b> # C source file; executable myexe
$ <b>g++ mycode.cpp -o myexe</b> # C++ source file
$ <b>gfortran mycode.f90 -o myexe</b> # Fortran90 source file
$ <b>gcc -fopenmp mycode.c -o myexe</b> # OpenMP; GNU flag is different than Intel</pre>
<p>Note that some compiler options are the same for both Intel and GNU (e.g. "<code>-o</code>"), while others are different (e.g. "<code>-qopenmp</code>" vs "<code>-fopenmp</code>"). Many options are available in one compiler suite but not the other. See the <a href="http://gcc.gnu.org/onlinedocs/">online GNU documentation</a> for information on optimization flags and other GNU compiler options.</p>
</div>
<div id="building-basics-complink">
<h3 id="compiling-and-linking-as-separate-steps"><a href="#building-basics-complink">Compiling and Linking as Separate Steps</a></h3>
<p>Building an executable requires two separate steps: (1) compiling (generating a binary object file associated with each source file); and (2) linking (combining those object files into a single executable file that also specifies the libraries that executable needs). The examples in the previous section accomplish these two steps in a single call to the compiler. When building more sophisticated applications or libraries, however, it is often necessary or helpful to accomplish these two steps separately.</p>
<p>Use the "<code>-c</code>" ("compile") flag to produce object files from source files:</p>
<pre class="cmd-line">$ <b>icc -c main.c calc.c results.c</b></pre>
<p>Barring errors, this command will produce object files <code>main.o</code>, <code>calc.o</code>, and <code>results.o</code>. Syntax for other Intel and GNU compilers is similar.</p>
<p>You can now link the object files to produce an executable file:</p>
<pre class="cmd-line">$ <b>icc main.o calc.o results.o -o myexe</b></pre>
<p>The compiler calls a linker utility (usually <code>/bin/ld</code>) to accomplish this task. Again, syntax for other compilers is similar.</p>
</div>
<div id="building-basics-inclib">
<h3 id="include-and-library-paths"><a href="#building-basics-inclib">Include and Library Paths</a></h3>
<p>Software often depends on pre-compiled binaries called libraries. When this is true, compiling usually requires using the "<code>-I</code>" option to specify paths to so-called header or include files that define interfaces to the procedures and data in those libraries. Similarly, linking often requires using the "<code>-L</code>" option to specify paths to the libraries themselves. Typical compile and link lines might look like this:</p>
<pre class="cmd-line">
$ <b>icc -c main.c -I${WORK}/mylib/inc -I${TACC_HDF5_INC}</b> # compile
$ <b>icc main.o -o myexe -L${WORK}/mylib/lib -L${TACC_HDF5_LIB} -lmylib -lhdf5</b> # link</pre>
<p>On Stampede2, both the <code>hdf5</code> and <code>phdf5</code> modules define the environment variables <code>$TACC_HDF5_INC</code> and <code>$TACC_HDF5_LIB</code>. Other module files define similar environment variables; see <a href="#using-modules">Using Modules to Manage Your Environment</a> for more information.</p>
<p>The details of the linking process vary, and order sometimes matters. Much depends on the type of library: static (<code>.a</code> suffix; library's binary code becomes part of executable image at link time) versus dynamically-linked shared (.so suffix; library's binary code is not part of executable; it's located and loaded into memory at run time). The link line can use rpath to store in the executable an explicit path to a shared library. In general, however, the <code>LD_LIBRARY_PATH</code> environment variable specifies the search path for dynamic libraries. For software installed at the system-level, TACC's modules generally modify <code>LD_LIBRARY_PATH</code> automatically. To see whether and how an executable named "<code>myexe</code>" resolves dependencies on dynamically linked libraries, execute "<code>ldd myexe</code>".</p>
<p>A separate section below addresses the <a href="#intel-math-kernel-library-mkl">Intel Math Kernel Library</a> (MKL).</p>
</div>
<div id="building-basics-mpi">
<h3 id="compiling-and-linking-mpi-programs"><a href="#building-basics-mpi">Compiling and Linking MPI Programs</a></h3>
<p>Intel MPI (module <code>impi</code>) and MVAPICH2 (module <code>mvapich2</code>) are the two MPI libraries available on Stampede2. After loading an <code>impi</code> or <code>mvapich2</code> module, compile and/or link using an mpi wrapper (<code>mpicc</code>, <code>mpicxx</code>, <code>mpif90</code>) in place of the compiler:</p>
<pre class="cmd-line">
$ <b>mpicc mycode.c -o myexe</b> # C source, full build
$ <b>mpicc -c mycode.c</b> # C source, compile without linking
$ <b>mpicxx mycode.cpp -o myexe</b> # C++ source, full build
$ <b>mpif90 mycode.f90 -o myexe</b> # Fortran source, full build</pre>
<p>These wrappers call the compiler with the options, include paths, and libraries necessary to produce an MPI executable using the MPI module you're using. To see the effect of a given wrapper, call it with the "<code>-show</code>" option:</p>
<pre class="cmd-line">$ <b>mpicc -show</b> # Show compile line generated by call to mpicc; similarly for other wrappers</pre>
</div>
<div id="building-basics-thirdparty">
<h3 id="building-third-party-software-in-your-own-account"><a href="#building-basics-thirdparty">Building Third-Party Software in Your Own Account</a></h3>
<p>You're welcome to download third-party research software and install it in your own account. In most cases you'll want to download the source code and build the software so it's compatible with the Stampede2 software environment. You can't use yum or any other installation process that requires elevated privileges, but this is almost never necessary. The key is to specify an installation directory for which you have write permissions. Details vary; you should consult the package's documentation and be prepared to experiment. When using the famous <a href="http://www.gnu.org/software/automake/manual/html_node/Autotools-Introduction.html">three-step autotools</a> build process, the standard approach is to use the <code>PREFIX</code> environment variable to specify a non-default, user-owned installation directory at the time you execute <code>configure</code> or <code>make</code>:</p>
<pre class="cmd-line">
$ <b>export INSTALLDIR=$WORK/apps/t3pio</b>
$ <b>./configure --prefix=$INSTALLDIR</b>
$ <b>make</b>
$ <b>make install</b></pre>
<p>Other languages, frameworks, and build systems generally have equivalent mechanisms for installing software in user space. In most cases a web search like "Python Linux install local" will get you the information you need.</p>
<p>In Python, a local install will resemble one of the following examples:</p>
<pre class="cmd-line">
$ <b>pip install netCDF4 --user</b> # install netCDF4 package to $HOME/.local
$ <b>python setup.py install --user</b> # install to $HOME/.local
$ <b>pip install netCDF4 --prefix=$INSTALLDIR</b> # custom location; add to PYTHONPATH</pre>
<p>Similarly in R:</p>
<pre class="cmd-line">
$ <b>module load Rstats</b> # load TACC's default R
$ <b>R</b> # launch R
&gt; <b>install.packages('devtools')</b> # R will prompt for install location</pre>
<p>You may, of course, need to customize the build process in other ways. It's likely, for example, that you'll need to edit a <code>makefile</code> or other build artifacts to specify Stampede2-specific <a href="#building-basics-inclib">include and library paths</a> or other compiler settings. A good way to proceed is to write a shell script that implements the entire process: definitions of environment variables, module commands, and calls to the build utilities. Include <code>echo</code> statements with appropriate diagnostics. Run the script until you encounter an error. Research and fix the current problem. Document your experience in the script itself, including dead-ends, alternatives, and lessons learned. Re-run the script to get to the next error, then repeat until done. When you're finished, you'll have a repeatable process that you can archive until it's time to update the software or move to a new machine.</p>
<p>If you wish to share a software package with collaborators, you may need to modify file permissions. See <a href="http://portal.tacc.utexas.edu/tutorials/sharing-project-files">Sharing Files with Collaborators</a> for more information.</p>
</div> <!-- Intel MKL -->
<div id="mkl">
<h2 id="intel-math-kernel-library-mkl"><a href="#mkl">Intel Math Kernel Library (MKL)</a></h2>
<p>The <a href="http://software.intel.com/intel-mkl">Intel Math Kernel Library</a> (MKL) is a collection of highly optimized functions implementing some of the most important mathematical kernels used in computational science, including standardized interfaces to:</p>
<ul>
<li><a href="http://netlib.org/blas">BLAS</a> (Basic Linear Algebra Subroutines), a collection of low-level matrix and vector operations like matrix-matrix multiplication</li>
<li><a href="http://netlib.org/lapack">LAPACK</a> (Linear Algebra PACKage), which includes higher-level linear algebra algorithms like Gaussian Elimination</li>
<li>FFT (Fast Fourier Transform), including interfaces based on <a href="http://fftw.org">FFTW</a> (Fastest Fourier Transform in the West)</li>
<li><a href="http://netlib.org/scalapack">ScaLAPACK</a> (Scalable LAPACK), <a href="http://netlib.org/blacs">BLACS</a> (Basic Linear Algebra Communication Subprograms), Cluster FFT, and other functionality that provide block-based distributed memory (multi-node) versions of selected <a href="https://software.intel.com/en-us/mkl-developer-reference-c-lapack-routines">LAPACK</a>, <a href="https://software.intel.com/en-us/mkl-developer-reference-c-blas-and-sparse-blas-routines">BLAS</a>, and <a href="https://software.intel.com/en-us/mkl-developer-reference-c-fft-functions">FFT</a> algorithms;</li>
<li><a href="http://software.intel.com/en-us/node/521751">Vector Mathematics</a> (VM) functions that implement highly optimized and vectorized versions of special functions like sine and square root.</li>
</ul>
</div>
<div id="mkl-intel">
<h3 id="mkl-with-intel-c-c-and-fortran-compilers"><a href="#mkl-intel">MKL with Intel C, C++, and Fortran Compilers</a></h3>
<p>There is no MKL module for the Intel compilers because you don't need one: the Intel compilers have built-in support for MKL. Unless you have specialized needs, there is no need to specify include paths and libraries explicitly. Instead, using MKL with the Intel modules requires nothing more than compiling and linking with the <span style="white-space: nowrap;">"<code>-mkl</code>"</span> option; e.g.</p>
<pre class="cmd-line">
$ <b>icc -mkl mycode.c</b>
$ <b>ifort -mkl mycode.f90</b></pre>
<p>The "<code>-mkl</code>" switch is an abbreviated form of <span style="white-space: nowrap;">"<code>-mkl=parallel</code>"</span>, which links your code to the threaded version of MKL. To link to the unthreaded version, use <span style="white-space: nowrap;">"<code>-mkl=sequential</code>"</span>. A third option, <span style="white-space: nowrap;">"<code>-mkl=cluster</code>"</span>, which also links to the unthreaded libraries, is necessary and appropriate only when using ScaLAPACK or other distributed memory packages. For additional information, including advanced linking options, see the <a href="http://software.intel.com/intel-mkl">MKL documentation</a> and <a href="http://software.intel.com/en-us/articles/intel-mkl-link-line-advisor">Intel MKL Link Line Advisor</a>.</p>
</div>
<div id="mkl-gnu">
<h3 id="mkl-with-gnu-c-c-and-fortran-compilers"><a href="#mkl-gnu">MKL with GNU C, C++, and Fortran Compilers</a></h3>
<p>When using a GNU compiler, load the MKL module before compiling or running your code, then specify explicitly the MKL libraries, library paths, and include paths your application needs. Consult the <a href="http://software.intel.com/en-us/articles/intel-mkl-link-line-advisor">Intel MKL Link Line Advisor</a> for details. A typical compile/link process on a TACC system will look like this:</p>
<pre class="cmd-line">
$ <b>module load gcc</b>
$ <b>module load mkl</b> # available/needed only for GNU compilers
$ <b>gcc -fopenmp -I$MKLROOT/include \
-Wl,-L${MKLROOT}/lib/intel64 \
-lmkl_intel_lp64 -lmkl_core \
-lmkl_gnu_thread -lpthread \
-lm -ldl mycode.c</b></pre>
<p>For your convenience the <code>mkl</code> module file also provides alternative TACC-defined variables like <code>$TACC_MKL_INCLUDE</code> (equivalent to <code>$MKLROOT/include</code>). Execute "<code>module help mkl</code>" for more information.</p>
</div>
<div id="mkl-thirdparty">
<h3 id="using-mkl-as-blaslapack-with-third-party-software"><a href="#mkl-thirdparty">Using MKL as BLAS/LAPACK with Third-Party Software</a></h3>
<p>When your third-party software requires BLAS or LAPACK, you can use MKL to supply this functionality. Replace generic instructions that include link options like <span style="white-space: nowrap;">"<code>-lblas</code>"</span> or <span style="white-space: nowrap;">"<code>-llapack</code>"</span> with the simpler MKL approach described above. There is no need to download and install alternatives like OpenBLAS.</p>
</div>
<div id="mkl-tacc">
<h3 id="using-mkl-as-blaslapack-with-taccs-matlab-python-and-r-modules"><a href="#mkl-tacc">Using MKL as BLAS/LAPACK with TACC's MATLAB, Python, and R Modules</a></h3>
<p>TACC's MATLAB, Python, and R modules all use threaded (parallel) MKL as their underlying BLAS/LAPACK library. This means that even serial codes written in MATLAB, Python, or R may benefit from MKL's thread-based parallelism. This requires no action on your part other than specifying an appropriate max thread count for MKL; see the section below for more information.</p>
</div>
<div id="mkl-threading">
<h3 id="controlling-threading-in-mkl"><a href="#mkl-threading">Controlling Threading in MKL</a></h3>
<p>Any code that calls MKL functions can potentially benefit from MKL's thread-based parallelism; this is true even if your code is not otherwise a parallel application. If you are linking to the threaded MKL (using <span style="white-space: nowrap;">"<code>-mkl</code>"</span>, <span style="white-space: nowrap;">"<code>-mkl=parallel</code>"</span>, or the equivalent explicit link line), you need only specify an appropriate value for the max number of threads available to MKL. You can do this with either of the two environment variables <code>MKL_NUM_THREADS</code> or <code>OMP_NUM_THREADS</code>. The environment variable <code>MKL_NUM_THREADS</code> specifies the max number of threads available to each instance of MKL, and has no effect on non-MKL code. If <code>MKL_NUM_THREADS</code> is undefined, MKL uses <code>OMP_NUM_THREADS</code> to determine the max number of threads available to MKL functions. In either case, MKL will attempt to choose an optimal thread count less than or equal to the specified value. Note that <code>OMP_NUM_THREADS</code> defaults to 1 on TACC systems; if you use the default value you will get no thread-based parallelism from MKL.</p>
<p>If you are running a single serial, unthreaded application (or an unthreaded MPI code involving a single MPI task per node) it is usually best to give MKL as much flexibility as possible by setting the max thread count to the total number of hardware threads on the node (272 on KNL, 96 on SKX). Of course things are more complicated if you are running more than one process on a node: e.g. multiple serial processes, threaded applications, hybrid MPI-threaded applications, or pure MPI codes running more than one MPI rank per node. See <a href="http://software.intel.com/en-us/articles/recommended-settings-for-calling-intel-mkl-routines-from-multi-threaded-applications" class="uri">http://software.intel.com/en-us/articles/recommended-settings-for-calling-intel-mkl-routines-from-multi-threaded-applications</a> and related Intel resources for examples of how to manage threading when calling MKL from multiple processes.</p>
</div>
<div id="mkl-othercapabilities">
<h3 id="using-scalapack-cluster-fft-and-other-mkl-cluster-capabilities"><a href="#mkl-othercapabilities">Using ScaLAPACK, Cluster FFT, and Other MKL Cluster Capabilities</a></h3>
<p>See "<a href="https://software.intel.com/en-us/mkl-linux-developer-guide-working-with-the-intel-math-kernel-library-cluster-software">Working with the Intel Math Kernel Library Cluster Software</a>" and "<a href="http://software.intel.com/en-us/articles/intel-mkl-link-line-advisor">Intel MKL Link Line Advisor</a>" for information on linking to the MKL cluster components.</p>
</div>
</div>
<div id="building-performance">
<h2 id="building-for-performance-on-stampede2"><a href="#building-performance">Building for Performance on Stampede2</a></h2>
<div id="building-performance-compiler">
<h3 id="compiler"><a href="#building-performance-compiler">Compiler</a></h3>
<p>When building software on Stampede2, we recommend using the most recent Intel compiler and Intel MPI library available on Stampede2. The most recent versions may be newer than the defaults. Execute <span style="white-space: nowrap;">"<code>module spider intel</code>"</span> and <span style="white-space: nowrap;">"<code>module spider impi</code>"</span> to see what's installed. When loading these modules you may need to specify version numbers explicitly (e.g. <span style="white-space: nowrap;">"<code>module load intel/18.0.0</code>"</span> and <span style="white-space: nowrap;">"<code>module load impi/18.0.0</code>"</span>).</p>
</div>
<div id="building-performance-architecture">
<h3 id="architecture-specific-flags"><a href="#building-performance-architecture">Architecture-Specific Flags</a></h3>
<p>To compile for KNL only, include "<code>-xMIC-AVX512</code>" as a build option. The "<code>-x</code>" switch allows you to specify a <a href="https://software.intel.com/en-us/fortran-compiler-18.0-developer-guide-and-reference-x-qx">target architecture</a>, while <code>MIC-AVX512</code> is the KNL-specific subset of Intel's Advanced Vector Extensions 512-bit <a href="https://software.intel.com/en-us/articles/performance-tools-for-software-developers-intel-compiler-options-for-sse-generation-and-processor-specific-optimizations">instruction set</a>. Besides all other appropriate compiler options, you should also consider specifying an <a href="https://software.intel.com/en-us/fortran-compiler-18.0-developer-guide-and-reference-o">optimization level</a> using the "<code>-O</code>" flag:</p>
<pre class="cmd-line">$ <b>icc -xMIC-AVX512 -O3 mycode.c -o myexe</b> # will run only on KNL</pre>
<p>Similarly, to build for SKX only, specify the <code>CORE-AVX512</code> instruction set, which is native to SKX:</p>
<pre class="cmd-line">$ <b>ifort -xCORE-AVX512 -O3 mycode.f90 -o myexe</b> # will run only on SKX</pre>
<p>Because Stampede2 has two kinds of compute nodes, however, we recommend a more flexible approach when building with the Intel compiler: use <a href="https://software.intel.com/en-us/articles/performance-tools-for-software-developers-sse-generation-and-processor-specific-optimizations-continue#1">CPU dispatch</a> to build a multi-architecture ("fat") binary that contains alternate code paths with optimized vector code for each type of Stampede2 node. To produce a multi-architecture binary for Stampede2, build with the following options:</p>
<pre><code>-xCORE-AVX2 -axCORE-AVX512,MIC-AVX512</code></pre>
<p>These particular choices allow you to build on any Stampede2 node (login node, KNL compute node, SKX compute node), and use <a href="https://software.intel.com/en-us/articles/performance-tools-for-software-developers-sse-generation-and-processor-specific-optimizations-continue#1">CPU dispatch</a> to produce a multi-architecture binary. We recommend that you specify these flags in both the compile and link steps. Specify an optimization level (e.g. "<code>-O3</code>") along with any other appropriate compiler switches:</p>
<pre class="cmd-line">$ <b>icc -xCORE-AVX2 -axCORE-AVX512,MIC-AVX512 -O3 mycode.c -o myexe</b></pre>
<p>The "<code>-x</code>" option is the target base architecture (instruction set). The base instruction set must run on all targeted processors. Here we specify <span style="white-space: nowrap;"><code>CORE-AVX2</code></span>, which is native for older Broadwell processors and supported on both KNL and SKX. This option allows configure scripts and similar build systems to run test executables on any Stampede2 login or compute node. The "<code>-ax</code>" option is a comma-separated list of alternate instruction sets: <span style="white-space: nowrap;"><code>CORE-AVX512</code></span> for SKX, and <span style="white-space: nowrap;"><code>MIC-AVX512</code></span> for KNL.</p>
<p>Now that we have replaced the original Broadwell login nodes with newer Skylake login nodes, <span style="white-space: nowrap;">"<code>-xCORE-AVX2</code>"</span> remains a reasonable (though conservative) base option. Another plausible, more aggressive base option is <span style="white-space: nowrap;">"<code>-xCOMMON-AVX512</code>"</span>, which is a subset of <code>AVX512</code> that runs on both SKX and KNL.</p>
<p><strong>It's best to avoid building with "<code>-xHost</code>"</strong> (a flag that means "optimize for the architecture on which I'm compiling now"). Using "<code>-xHost</code>" on a SKX login node, for example, will result in a binary that won't run on KNL.</p>
<p>Don't skip the "<code>-x</code>" flag in a multi-architecture build: the default is the very old SSE2 (Pentium 4) instruction set. <strong>Don't create a multi-architecture build with a base option of either <span style="white-space: nowrap;">"<code>-xMIC-AVX512</code>"</span> (native on KNL) or <span style="white-space: nowrap;">"<code>-xCORE-AVX512</code>"</span> (native on SKX);</strong> there are no meaningful, compatible alternate ("<code>-ax</code>") instruction sets:</p>
<pre class="cmd-line">$ <b>icc <s>-xCORE-AVX512 -axMIC-AVX512 -O3 mycode.c -o myexe</s></b> # NO! Base incompatible with alternate</pre>
<p>On Stampede2, the module files for newer Intel compilers (Intel 18.0.0 and later) define the environment variable <code>TACC_VEC_FLAGS</code> that stores the recommended architecture flags described above. This can simplify your builds:</p>
<pre class="cmd-line">$ <b>echo $TACC_VEC_FLAGS</b>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;# env variable available only for intel/18.0.0 and later
-xCORE-AVX2 -axCORE-AVX512,MIC-AVX512
$ <b>icc $TACC_VEC_FLAGS -O3 mycode.c -o myexe</b></pre>
<p>Simplicity is a major advantage of this multi-architecture approach: it allows you to build and run anywhere on Stampede2, and performance is generally comparable to single-architecture builds. There are some trade-offs to consider, however. This approach will take a little longer to compile than single-architecture builds, and will produce a larger binary. In some cases, you might also pay a small performance penalty over single-architecture approaches. For more information see the <a href="https://software.intel.com/en-us/articles/performance-tools-for-software-developers-intel-compiler-options-for-sse-generation-and-processor-specific-optimizations">Intel documentation</a>.</p>
<p>For information on the performance implications of your choice of build flags, see the sections on Programming and Performance for <a href="#programming-and-performance-knl">KNL</a> and <a href="#programming-and-performance-skx">SKX</a> respectively.</p>
<p>If you use GNU compilers, see <a href="https://gcc.gnu.org/onlinedocs/gcc/x86-Options.html">GNU x86 Options</a> for information regarding support for KNL and SKX. Note that GNU compilers do not support multi-architecture binaries.</p>
</div>
</div>
</div>
<div id="running">
<h1 id="running-jobs-on-the-stampede2-compute-nodes"><a href="#running">Running Jobs on the Stampede2 Compute Nodes</a></h1> <!-- taccinfo blurb -->
<div id="running-accounting">
<h2 id="job-accounting"><a href="#running-accounting">Job Accounting</a></h2>
<p>Like all TACC systems, Stampede2's accounting system is based on node-hours: one unadjusted Service Unit (SU) represents a single compute node used for one hour (a node-hour). For any given job, the total cost in SUs is the use of one compute node for one hour of wall clock time plus any additional charges for the use of specialized queues, e.g. Stampede2's <code>largemem</code> queue, Lonestar5's <code>gpu</code> queue and Longhorn's <code>v100</code> queue. The <a href="#queues">queue charge rates</a> are determined by the supply and demand for that particular queue or type of node used.</p>
<p><span style="white-space: nowrap;"><b>Stampede2 SUs billed = (# nodes) x (job duration in wall clock hours) x (charge rate per node-hour)</b></span></p>
<p>The Slurm scheduler tracks and charges for usage to a granularity of a few seconds of wall clock time. <strong>The system charges only for the resources you actually use, not those you request.</strong> If your job finishes early and exits properly, Slurm will release the nodes back into the pool of available nodes. Your job will only be charged for as long as you are using the nodes.</p>
<p class="portlet-msg-info"> TACC does not implement node-sharing on any compute resource. Each Stampede2 node can be assigned to only one user at a time; hence a complete node is dedicated to a user's job and accrues wall-clock time for all the node's cores whether or not all cores are used. </p>
<p><strong>Tip</strong>: Your queue wait times will be shorter if you request only the time you need: the scheduler will have a much easier time finding a slot for the 2 hours you really need than, say, for the 12 hours requested in your job script.</p>
<p>Principal Investigators can monitor allocation usage via the <a href="https://portal.tacc.utexas.edu">TACC User Portal</a> under <a href="https://portal.tacc.utexas.edu/projects-and-allocations">"Allocations-&gt;Projects and Allocations"</a>. Be aware that the figures shown on the portal may lag behind the most recent usage. Projects and allocation balances are also displayed upon command-line login.</p>
<p class="portlet-msg-info">To display a summary of your TACC project balances and disk quotas at any time, execute:<br><br><code>login1$ <b>/usr/local/etc/taccinfo</b>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;# Generally more current than balances displayed on the portals.</code> </p>
</div>
<div id="running-slurm">
<h2 id="slurm-job-scheduler"><a href="#running-slurm">Slurm Job Scheduler</a></h2>
<p>Stampede2's job scheduler is the <a href="http://schedmd.com">Slurm Workload Manager</a>. Slurm commands enable you to submit, manage, monitor, and control your jobs.</p>
</div>
<div id="running-queues">
<h2 id="slurm-partitions-queues"><a href="#running-queues">Slurm Partitions (Queues)</a></h2>
<p>Currently available queues include those in <a href="#table5">Stampede2 Production Queues</a>. See <a href="#overview-phase1computenodes">KNL Compute Nodes</a>, <a href="#overview-skxcomputenodes">SKX Compute Nodes</a>, <a href="#programming-knl-memorymodes">Memory Modes</a>, and <a href="#programming-knl-clustermodes">Cluster Modes</a> for more information on node types.</p>
</div>
<div id="queues"></div>
<div id="table5">
<p><a href="#table5">Table 5. Stampede2 Production Queues</a></p>
</div>
<table border="1" cellpadding="3">
<tbody>
<tr align="center">
<th align="center">Queue Name</th>
<th align="center">Node Type</th>
<th align="center">Max Nodes per Job<br> (assoc'd cores)*</th>
<th align="center">Max Duration</th>
<th align="center">Max Jobs in Queue*</th>
<th align="center">Charge Rate<br> (per node-hour)</th>
</tr>
<tr align="center">
<td><code>development</code></td>
<td>KNL cache-quadrant</td>
<td>16 nodes<br> (1,088 cores)*</td>
<td>2 hrs</td>
<td>1*</td>
<td>0.8 Service Unit (SU)</td>
</tr>
<tr align="center">
<td><code>normal</code></td>
<td>KNL cache-quadrant</td>
<td>256 nodes<br> (17,408 cores)*</td>
<td>48 hrs</td>
<td>50*</td>
<td>0.8 SU</td>
</tr>
<tr align="center">
<td><code>large</code>**</td>
<td>KNL cache-quadrant</td>
<td>2048 nodes<br> (139,264 cores)*</td>
<td>48 hrs</td>
<td>5*</td>
<td>0.8 SU</td>
</tr>
<tr align="center">
<td><code>long</code></td>
<td>KNL cache-quadrant</td>
<td>32 nodes<br>(2,176 cores)*</td>
<td>120 hrs</td>
<td>2*</td>
<td>0.8 SU</td>
</tr>
<tr align="center">
<td><code>flat-quadrant</code></td>
<td>KNL flat-quadrant</td>
<td>32 nodes<br> (2,176 cores)*</td>
<td>48 hrs</td>
<td>5*</td>
<td>0.8 SU</td>
</tr>
<tr align="center">
<td><code>skx-dev</code></td>
<td>SKX</td>
<td>4 nodes<br>(192 cores)*</td>
<td>2 hrs</td>
<td>1*</td>
<td>1 SU</td>
</tr>
<tr align="center">
<td><code>skx-normal</code></td>
<td>SKX</td>
<td>128 nodes<br>(6,144 cores)*</td>
<td>48 hrs</td>
<td>25*</td>
<td>1 SU</td>
</tr>
<tr align="center">
<td><code>skx-large</code>**</td>
<td>SKX</td>
<td>868 nodes<br>(41,664 cores)*</td>
<td>48 hrs</td>
<td>3*</td>
<td>1 SU</td>
</tr>
</tbody>
</table>
<p> </p>
<p>* Queue status as of January 14, 2019. <strong>Queues and limits are subject to change without notice.</strong> Execute "<code>qlimits</code>" on Stampede2 for real-time information regarding limits on available queues. See <a href="#monitoring">Monitoring Jobs and Queues</a> for additional information.</p>
<p>** To request more nodes than are available in the normal queue, submit a consulting (help desk) ticket through the <a href="http://portal.tacc.utexas.edu/">TACC</a> or <a href="http://portal.xsede.org/">XSEDE</a> user portal. Include in your request reasonable evidence of your readiness to run under the conditions you're requesting. In most cases this should include your own strong or weak scaling results from Stampede2.</p>
<p>*** For non-hybrid memory-cluster modes or other special requirements, submit a ticket through the <a href="http://portal.tacc.utexas.edu/">TACC</a> or <a href="http://portal.xsede.org/">XSEDE</a> user portal.</p>
<div id="running-sbatch">
<h2 id="submitting-batch-jobs-with-sbatch"><a href="#running-sbatch">Submitting Batch Jobs with <code>sbatch</code></a></h2>
<p>Use Slurm's "<code>sbatch</code>" command to <a href="#using-computenodes">submit a batch job</a> to one of the Stampede2 queues:</p>
<pre class="cmd-line">login1$ <b>sbatch myjobscript</b></pre>
<p>Here "<code>myjobscript</code>" is the name of a text file containing <code>#SBATCH</code> directives and shell commands that describe the particulars of the job you are submitting. The details of your job script's contents depend on the type of job you intend to run.</p>
<p>In your job script you (1) use <code>#SBATCH</code> directives to request computing resources (e.g. 10 nodes for 2 hrs); and then (2) use shell commands to specify what work you're going to do once your job begins. There are many possibilities: you might elect to launch a single application, or you might want to accomplish several steps in a workflow. You may even choose to launch more than one application at the same time. The details will vary, and there are many possibilities. But your own job script will probably include at least one launch line that is a variation of one of the examples described here.</p>
<div id="running-sbatch-jobscripts">
<h3 id="job-scripts"><a href="#running-sbatch-jobscripts">Job Scripts</a></h3>
</div>
</div>
</div>
<center>
<table border="1" cellpadding="3" cellspacing="1">
<tbody>
<tr valign="top">
<td><a href="javascript:showhideknlserial()"><img src="/documents/10157/0/small-right-arrow.png/32a37818-3255-40f3-bb33-795fac19a3dd?t=1480440057000" id="img-knlserial">KNL Serial Job in Normal Queue</a><br>
<div id="knlserial" style="display:none">
<pre>
#!/bin/bash
#----------------------------------------------------
# Sample Slurm job script
# for TACC Stampede2 KNL nodes
#
# *** Serial Job on Normal Queue ***
#
# Last revised: 20 Oct 2017
#
# Notes:
#
# -- Copy/edit this script as desired. Launch by executing
# "sbatch knl.serial.slurm" on a Stampede2 login node.
#
# -- Serial codes run on a single node (upper case N = 1).
# A serial code ignores the value of lower case n,
# but slurm needs a plausible value to schedule the job.
#
# -- For a good way to run multiple serial executables at the
# same time, execute "module load launcher" followed
# by "module help launcher".
#----------------------------------------------------
#SBATCH -J myjob # Job name
#SBATCH -o myjob.o%j # Name of stdout output file
#SBATCH -e myjob.e%j # Name of stderr error file
#SBATCH -p normal # Queue (partition) name
#SBATCH -N 1 # Total # of nodes (must be 1 for serial)
#SBATCH -n 1 # Total # of mpi tasks (should be 1 for serial)
#SBATCH -t 01:30:00 # Run time (hh:mm:ss)
#SBATCH --mail-user=username@tacc.utexas.edu
#SBATCH --mail-type=all # Send email at begin and end of job
#SBATCH -A myproject # Allocation name (req'd if you have more than 1)
# Other commands must follow all #SBATCH directives...
module list
pwd
date
# Launch serial code...
./mycode.exe # Do not use ibrun or any other MPI launcher
# ---------------------------------------------------</pre>
</div></td>
<td valign="top"><a href="javascript:showhideskxserial()"><img src="/documents/10157/0/small-right-arrow.png/32a37818-3255-40f3-bb33-795fac19a3dd?t=1480440057000" id="img-skxserial">SKX Serial Job in Normal Queue</a><br>
<div id="skxserial" style="display:none">
<pre>
#!/bin/bash
#----------------------------------------------------
# Sample Slurm job script
# for TACC Stampede2 SKX nodes
#
# *** Serial Job on SKX Normal Queue ***
#
# Last revised: 20 Oct 2017
#
# Notes:
#
# -- Copy/edit this script as desired. Launch by executing
# "sbatch skx.serial.slurm" on a Stampede2 login node.
#
# -- Serial codes run on a single node (upper case N = 1).
# A serial code ignores the value of lower case n,
# but slurm needs a plausible value to schedule the job.
#
# -- For a good way to run multiple serial executables at the
# same time, execute "module load launcher" followed
# by "module help launcher".
#----------------------------------------------------
#SBATCH -J myjob # Job name
#SBATCH -o myjob.o%j # Name of stdout output file
#SBATCH -e myjob.e%j # Name of stderr error file
#SBATCH -p skx-normal # Queue (partition) name
#SBATCH -N 1 # Total # of nodes (must be 1 for serial)
#SBATCH -n 1 # Total # of mpi tasks (should be 1 for serial)
#SBATCH -t 01:30:00 # Run time (hh:mm:ss)
#SBATCH --mail-user=username@tacc.utexas.edu
#SBATCH --mail-type=all # Send email at begin and end of job
#SBATCH -A myproject # Allocation name (req'd if you have more than 1)
# Other commands must follow all #SBATCH directives...
module list
pwd
date
# Launch serial code...
./mycode.exe # Do not use ibrun or any other MPI launcher
# ---------------------------------------------------</pre>
</div></td>
</tr>
<tr>
<td valign="top"><a href="javascript:showhideknlmpi()"><img src="/documents/10157/0/small-right-arrow.png/32a37818-3255-40f3-bb33-795fac19a3dd?t=1480440057000" id="img-knlmpi">KNL MPI Job in Normal Queue</a>
<div id="knlmpi" style="display:none">
<pre>
#!/bin/bash
#----------------------------------------------------
# Sample Slurm job script
# for TACC Stampede2 KNL nodes
#
# *** MPI Job on Normal Queue ***
#
# Last revised: 20 Oct 2017
#
# Notes:
#
# -- Launch this script by executing
# "sbatch knl.mpi.slurm" on a Stampede2 login node.
#
# -- Use ibrun to launch MPI codes on TACC systems.
# Do not use mpirun or mpiexec.
#
# -- Max recommended MPI tasks per KNL node: 64-68
# (start small, increase gradually).
#
# -- If you're running out of memory, try running
# fewer tasks per node to give each task more memory.
#
#----------------------------------------------------
#SBATCH -J myjob # Job name
#SBATCH -o myjob.o%j # Name of stdout output file
#SBATCH -e myjob.e%j # Name of stderr error file
#SBATCH -p normal # Queue (partition) name
#SBATCH -N 4 # Total # of nodes
#SBATCH -n 32 # Total # of mpi tasks
#SBATCH -t 01:30:00 # Run time (hh:mm:ss)
#SBATCH --mail-user=username@tacc.utexas.edu
#SBATCH --mail-type=all # Send email at begin and end of job
#SBATCH -A myproject # Allocation name (req'd if you have more than 1)
# Other commands must follow all #SBATCH directives...
module list
pwd
date
# Launch MPI code...
ibrun ./mycode.exe # Use ibrun instead of mpirun or mpiexec
# ---------------------------------------------------</pre>
</div></td>
<td valign="top"><a href="javascript:showhideskxmpi()"><img src="/documents/10157/0/small-right-arrow.png/32a37818-3255-40f3-bb33-795fac19a3dd?t=1480440057000" id="img-skxmpi">SKX MPI Job in Normal Queue</a>
<div id="skxmpi" style="display:none">
<pre>
#!/bin/bash
#----------------------------------------------------
# Sample Slurm job script
# for TACC Stampede2 SKX nodes
#
# *** MPI Job on SKX Normal Queue ***
#
# Last revised: 20 Oct 2017
#
# Notes:
#
# -- Launch this script by executing
# "sbatch skx.mpi.slurm" on a Stampede2 login node.
#
# -- Use ibrun to launch MPI codes on TACC systems.
# Do not use mpirun or mpiexec.
#
# -- Max recommended MPI ranks per SKX node: 48
# (start small, increase gradually).
#
# -- If you're running out of memory, try running
# fewer tasks per node to give each task more memory.
#
#----------------------------------------------------
#SBATCH -J myjob # Job name
#SBATCH -o myjob.o%j # Name of stdout output file
#SBATCH -e myjob.e%j # Name of stderr error file
#SBATCH -p skx-normal # Queue (partition) name
#SBATCH -N 4 # Total # of nodes
#SBATCH -n 32 # Total # of mpi tasks
#SBATCH -t 01:30:00 # Run time (hh:mm:ss)
#SBATCH --mail-user=username@tacc.utexas.edu
#SBATCH --mail-type=all # Send email at begin and end of job
#SBATCH -A myproject # Allocation name (req'd if you have more than 1)
# Other commands must follow all #SBATCH directives...
module list
pwd
date
# Launch MPI code...
ibrun ./mycode.exe # Use ibrun instead of mpirun or mpiexec
# ---------------------------------------------------</pre>
</div></td>
</tr>
<tr valign="top">
<td><a href="javascript:showhideknlopenmp()"><img src="/documents/10157/0/small-right-arrow.png/32a37818-3255-40f3-bb33-795fac19a3dd?t=1480440057000" id="img-knlopenmp">KNL OpenMP Job in Normal Queue</a>
<div id="knlopenmp" style="display:none">
<pre>
#!/bin/bash
#----------------------------------------------------
# Sample Slurm job script
# for TACC Stampede2 KNL nodes
#
# *** OpenMP Job on Normal Queue ***
#
# Last revised: 20 Oct 2017
#
# Notes:
#
# -- Copy/edit this script as desired. Launch by executing
# "sbatch knl.openmp.slurm" on a Stampede2 login node.
#
# -- OpenMP codes run on a single node (upper case N = 1).
# OpenMP ignores the value of lower case n,
# but slurm needs a plausible value to schedule the job.
#
# -- Default value of OMP_NUM_THREADS is 1; be sure to change it!
#
# -- Increase thread count gradually while looking for optimal setting.
# If there is sufficient memory available, the optimal setting
# is often 68 (1 thread per core) or 136 (2 threads per core).
#----------------------------------------------------
#SBATCH -J myjob # Job name
#SBATCH -o myjob.o%j # Name of stdout output file
#SBATCH -e myjob.e%j # Name of stderr error file
#SBATCH -p normal # Queue (partition) name
#SBATCH -N 1 # Total # of nodes (must be 1 for OpenMP)
#SBATCH -n 1 # Total # of mpi tasks (should be 1 for OpenMP)
#SBATCH -t 01:30:00 # Run time (hh:mm:ss)
#SBATCH --mail-user=username@tacc.utexas.edu
#SBATCH --mail-type=all # Send email at begin and end of job
#SBATCH -A myproject # Allocation name (req'd if you have more than 1)
# Other commands must follow all #SBATCH directives...
module list
pwd
date
# Set thread count (default value is 1)...
export OMP_NUM_THREADS=34 # this is 1 thread per 2 cores; increase gradually (see notes above)
# Launch OpenMP code...
./mycode.exe # Do not use ibrun or any other MPI launcher
# ---------------------------------------------------</pre>
</div></td>
<td valign="top"><a href="javascript:showhideskxopenmp()"><img src="/documents/10157/0/small-right-arrow.png/32a37818-3255-40f3-bb33-795fac19a3dd?t=1480440057000" id="img-skxopenmp">SKX OpenMP Job in Normal Queue</a>
<div id="skxopenmp" style="display:none">
<pre>
#!/bin/bash
#----------------------------------------------------
# Sample Slurm job script
# for TACC Stampede2 SKX nodes
#
# *** OpenMP Job on SKX Normal Queue ***
#
# Last revised: 20 Oct 2017
#
# Notes:
#
# -- Copy/edit this script as desired. Launch by executing
# "sbatch skx.openmp.slurm" on a Stampede2 login node.
#
# -- OpenMP codes run on a single node (upper case N = 1).
# OpenMP ignores the value of lower case n,
# but slurm needs a plausible value to schedule the job.
#
# -- Default value of OMP_NUM_THREADS is 1; be sure to change it!
#
# -- Increase thread count gradually while looking for optimal setting.
# If there is sufficient memory available, the optimal setting
# is often 48 (1 thread per core) but may be higher.
#----------------------------------------------------
#SBATCH -J myjob # Job name
#SBATCH -o myjob.o%j # Name of stdout output file
#SBATCH -e myjob.e%j # Name of stderr error file
#SBATCH -p skx-normal # Queue (partition) name
#SBATCH -N 1 # Total # of nodes (must be 1 for OpenMP)
#SBATCH -n 1 # Total # of mpi tasks (should be 1 for OpenMP)
#SBATCH -t 01:30:00 # Run time (hh:mm:ss)
#SBATCH --mail-user=username@tacc.utexas.edu
#SBATCH --mail-type=all # Send email at begin and end of job
#SBATCH -A myproject # Allocation name (req'd if you have more than 1)
# Other commands must follow all #SBATCH directives...
module list
pwd
date
# Set thread count (default value is 1)...
export OMP_NUM_THREADS=48 # this is 1 thread/core; may want to start lower
# Launch OpenMP code...
./mycode.exe # Do not use ibrun or any other MPI launcher
# ---------------------------------------------------</pre>
</div></td>
</tr>
<tr valign="top">
<td><a href="javascript:showhideknlhybrid()"><img src="/documents/10157/0/small-right-arrow.png/32a37818-3255-40f3-bb33-795fac19a3dd?t=1480440057000" id="img-knlhybrid">KNL Hybrid Job in Normal Queue</a>
<div id="knlhybrid" style="display:none">
<pre>
#!/bin/bash
#----------------------------------------------------
# Example Slurm job script
# for TACC Stampede2 KNL nodes
#
# *** Hybrid Job on Normal Queue ***
#
# This sample script specifies:
# 10 nodes (capital N)
# 40 total MPI tasks (lower case n); this is 4 tasks/node
# 16 OpenMP threads per MPI task (64 threads per node)
#
# Last revised: 20 Oct 2017
#
# Notes:
#
# -- Launch this script by executing
# "sbatch knl.hybrid.slurm" on Stampede2 login node.
#
# -- Use ibrun to launch MPI codes on TACC systems.
# Do not use mpirun or mpiexec.
#
# -- In most cases it's best to specify no more
# than 64-68 MPI ranks or independent processes
# per node, and 1-2 threads/core.
#
# -- If you're running out of memory, try running
# fewer tasks and/or threads per node to give each
# process access to more memory.
#
# -- IMPI and MVAPICH2 both do sensible process pinning by default.
#
#----------------------------------------------------
#SBATCH -J myjob # Job name
#SBATCH -o myjob.o%j # Name of stdout output file
#SBATCH -e myjob.e%j # Name of stderr error file
#SBATCH -p normal # Queue (partition) name
#SBATCH -N 10 # Total # of nodes
#SBATCH -n 40 # Total # of mpi tasks
#SBATCH -t 01:30:00 # Run time (hh:mm:ss)
#SBATCH --mail-user=username@tacc.utexas.edu
#SBATCH --mail-type=all # Send email at begin and end of job
#SBATCH -A myproject # Allocation name (req'd if you have more than 1)
# Other commands must follow all #SBATCH directives...
module list
pwd
date
# Set thread count (default value is 1)...
export OMP_NUM_THREADS=16
# Launch MPI code...
ibrun ./mycode.exe # Use ibrun instead of mpirun or mpiexec
# ---------------------------------------------------</pre>
</div></td>
<td valign="top"><a href="javascript:showhideskxhybrid()"><img src="/documents/10157/0/small-right-arrow.png/32a37818-3255-40f3-bb33-795fac19a3dd?t=1480440057000" id="img-skxhybrid">SKX Hybrid Job in Normal Queue</a>
<div id="skxhybrid" style="display:none">
<pre>
#!/bin/bash
#----------------------------------------------------
# Example Slurm job script
# for TACC Stampede2 SKX nodes
#
# *** Hybrid Job on SKX Normal Queue ***
#
# This sample script specifies:
# 10 nodes (capital N)
# 40 total MPI tasks (lower case n); this is 4 tasks/node
# 12 OpenMP threads per MPI task (48 threads per node)
#
# Last revised: 20 Oct 2017
#
# Notes:
#
# -- Launch this script by executing
# "sbatch skx.hybrid.slurm" on Stampede2 login node.
#
# -- Use ibrun to launch MPI codes on TACC systems.
# Do not use mpirun or mpiexec.
#
# -- In most cases it's best to keep
# ( MPI ranks per node ) x ( threads per rank )
# to a number no more than 48 (total cores).
#
# -- If you're running out of memory, try running
# fewer tasks and/or threads per node to give each
# process access to more memory.
#
# -- IMPI and MVAPICH2 both do sensible process pinning by default.
#
#----------------------------------------------------
#SBATCH -J myjob # Job name
#SBATCH -o myjob.o%j # Name of stdout output file
#SBATCH -e myjob.e%j # Name of stderr error file
#SBATCH -p skx-normal # Queue (partition) name
#SBATCH -N 10 # Total # of nodes
#SBATCH -n 40 # Total # of mpi tasks
#SBATCH -t 01:30:00 # Run time (hh:mm:ss)
#SBATCH --mail-user=username@tacc.utexas.edu
#SBATCH --mail-type=all # Send email at begin and end of job
#SBATCH -A myproject # Allocation name (req'd if you have more than 1)
# Other commands must follow all #SBATCH directives...
module list
pwd
date
# Set thread count (default value is 1)...
export OMP_NUM_THREADS=12
# Launch MPI code...
ibrun ./mycode.exe # Use ibrun instead of mpirun or mpiexec
# ---------------------------------------------------</pre>
</div></td>
</tr>
</tbody>
</table>
</center>
<p>&nbsp;</p>
<p>Your job will run in the environment it inherits at submission time; this environment includes the modules you have loaded and the current working directory. In most cases you should <strong>run your application(s) after loading the same modules that you used to build them</strong>. You can of course use your job submission script to modify this environment by defining new environment variables; changing the values of existing environment variables; loading or unloading modules; changing directory; or specifying relative or absolute paths to files. <strong>Do not use the Slurm <span style="white-space: nowrap;">"<code>--export</code>"</span> option to manage your job's environment</strong>: doing so can interfere with the way the system propagates the inherited environment.</p>
<p>The <a href="#table6">Common <code>sbatch</code> Options table</a> below describes some of the most common <code>sbatch</code> command options. Slurm directives begin with "<code>#SBATCH</code>"; most have a short form (e.g. <span style="white-space: nowrap;">"<code>-N</code>"</span>) and a long form (e.g. <span style="white-space: nowrap;">"<code>--nodes</code>"</span>). You can pass options to <code>sbatch</code> using either the command line or job script; most users find that the job script is the easier approach. The first line of your job script must specify the interpreter that will parse non-Slurm commands; in most cases <span style="white-space: nowrap;">"<code>#!/bin/bash</code>"</span> or <span style="white-space: nowrap;">"<code>#!/bin/csh</code>"</span> is the right choice. Avoid <span style="white-space: nowrap;">"<code>#!/bin/sh</code>"</span> (its startup behavior can lead to subtle problems on Stampede2), and do not include comments or any other characters on this first line. All <code>#SBATCH</code> directives must precede all shell commands. Note also that certain <code>#SBATCH</code> options or combinations of options are mandatory, while others are not available on Stampede2.</p>
<div id="table6">
<p><a href="#table6">Table 6. Common <code>sbatch</code> Options</a></p>
<table border="1" cellpadding="3">
<tbody>
<tr>
<th>Option</th>
<th>Argument</th>
<th>Comments</th>
</tr>
<tr>
<td><code>-p</code></td>
<td><i>queue_name</i></td>
<td>Submits to queue (partition) designated by <i>queue_name</i></td>
</tr>
<tr>
<td><code>-J</code></td>
<td><i>job_name</i></td>
<td>Job Name</td>
</tr>
<tr>
<td><code>-N</code></td>
<td><i>total_nodes</i></td>
<td>Required. Define the resources you need by specifying either:<br>(1) "<code>-N</code>" and "<code>-n</code>"; or<br>(2) "<code>-N</code>" and "<code>--ntasks-per-node</code>".</td>
</tr>
<tr>
<td><code>-n</code></td>
<td><i>total_tasks</i></td>
<td>This is total MPI tasks in this job. See "<code>-N</code>" above for a good way to use this option. When using this option in a non-MPI job, it is usually best to set it to the same value as "<code>-N</code>".</td>
</tr>
<tr>
<td nowrap><code>--ntasks-per-node</code><br>or<br><code>--tasks-per-node</code></td>
<td><i>tasks_per_node</i></td>
<td>This is MPI tasks per node. See "<code>-N</code>" above for a good way to use this option. When using this option in a non-MPI job, it is usually best to set <code>--ntasks-per-node</code> to 1.</td>
</tr>
<tr>
<td><code>-t</code></td>
<td><i>hh:mm:ss</i></td>
<td>Required. Wall clock time for job.</td>
</tr>
<tr>
<td><code>--mail-user=</code></td>
<td><i>email_address</i></td>
<td>Specify the email address to use for notifications. Use with the <code>--mail-type=</code> flag below.</td>
</tr>
<tr>
<td><code>--mail-type=</code></td>
<td><code>begin</code>, <code>end</code>, <code>fail</code>, or <code>all</code></td>
<td>Specify when user notifications are to be sent (one option per line).</td>
</tr>
<tr>
<td><code>-o</code></td>
<td><i>output_file</i></td>
<td>Direct job standard output to <i>output_file</i> (without the <code>-e</code> option, error output also goes to this file)</td>
</tr>
<tr>
<td><code>-e</code></td>
<td><i>error_file</i></td>
<td>Direct job error output to <i>error_file</i></td>
</tr>
<tr>
<td><code>-d</code><br>or<br><code>--dependency=</code></td>
<td>afterok:<i>jobid</i></td>
<td>Specifies a dependency: this run will start only after the specified job (<i>jobid</i>) successfully finishes</td>
</tr>
<tr>
<td><code>-A</code></td>
<td><i>projectnumber</i></td>
<td>Charge job to the specified project/allocation number. This option is only necessary for logins associated with multiple projects.</td>
</tr>
<tr>
<td><code>-a</code><br>or<br><code>--array</code></td>
<td>N/A</td>
<td>Not available. Use the <code>launcher</code> module for parameter sweeps and other collections of related serial jobs.</td>
</tr>
<tr>
<td><code>--mem</code></td>
<td>N/A</td>
<td>Not available. If you attempt to use this option, the scheduler will not accept your job.</td>
</tr>
<tr>
<td><code>--export=</code></td>
<td>N/A</td>
<td>Avoid this option on Stampede2. Using it is rarely necessary and can interfere with the way the system propagates your environment.</td>
</tr>
</tbody>
</table>
<p> </p>
<p>By default, Slurm writes all console output to a file named <span style="white-space: nowrap;">"<code>slurm-%j.out</code>"</span>, where <code>%j</code> is the numerical job ID. To specify a different filename use the <span style="white-space: nowrap;">"<code>-o</code>"</span> option. To save <code>stdout</code> (standard out) and <code>stderr</code> (standard error) to separate files, specify both <span style="white-space: nowrap;">"<code>-o</code>"</span> and <span style="white-space: nowrap;">"<code>-e</code>"</span>.</p>
<div id="running-launching">
<h2 id="launching-applications"><a href="#running-launching">Launching Applications</a></h2>
<p>The primary purpose of your job script is to launch your research application. How you do so depends on several factors, especially (1) the type of application (e.g. MPI, OpenMP, serial), and (2) what you're trying to accomplish (e.g. launch a single instance, complete several steps in a workflow, run several applications simultaneously within the same job). While there are many possibilities, your own job script will probably include a launch line that is a variation of one of the examples described in this section:</p>
<ul>
<li><a href="#running-launching-serial">Launching One Serial Application</a></li>
<li><a href="#running-launching-multi">Launching One Multi-Threaded Application</a></li>
<li><a href="#running-launching-mpi">Launching One MPI Application</a></li>
<li><a href="#running-launching-hybrid">Launching One Hybrid (MPI+Threads) Application</a></li>
<li><a href="#running-launching-serialmorethanone">More Than One Serial Application in the Same Job</a></li>
<li><a href="#running-launching-consecutivempi">MPI Applications One at a Time</a></li>
<li><a href="#running-launching-mpisimultaneous">More than One MPI Application Running Concurrently</a></li>
<li><a href="#running-launching-openmpsimultaneous">More than One OpenMP Application Running Concurrently</a></li>
</ul>
<div id="running-launching-serial">
<h3 id="launching-one-serial-application"><a href="#running-launching-serial">Launching One Serial Application</a></h3>
<p>To launch a serial application, simply call the executable. Specify the path to the executable in either the PATH environment variable or in the call to the executable itself:</p>
<pre class="job-script">
mycode.exe # executable in a directory listed in $PATH
$WORK/apps/myprov/mycode.exe # explicit full path to executable
./mycode.exe # executable in current directory
./mycode.exe -m -k 6 input1 # executable with notional input options</pre>
</div>
<div id="running-launching-multi">
<h3 id="launching-one-multi-threaded-application"><a href="#running-launching-multi">Launching One Multi-Threaded Application</a></h3>
<p>Launch a threaded application the same way. Be sure to specify the number of threads. <strong>Note that the default OpenMP thread count is 1</strong>.</p>
<pre class="job-script">
export OMP_NUM_THREADS=68 # 68 total OpenMP threads (1 per KNL core)
./mycode.exe</pre>
</div>
<div id="running-launching-mpi">
<h3 id="launching-one-mpi-application"><a href="#running-launching-mpi">Launching One MPI Application</a></h3>
<p>To launch an MPI application, use the TACC-specific MPI launcher "<code>ibrun</code>", which is a Stampede2-aware replacement for generic MPI launchers like <code>mpirun</code> and <code>mpiexec</code>. In most cases the only arguments you need are the name of your executable followed by any options your executable needs. When you call <code>ibrun</code> without other arguments, your Slurm <code>#SBATCH</code> directives will determine the number of ranks (MPI tasks) and number of nodes on which your program runs.</p>
<pre class="job-script">
ibrun ./mycode.exe # use ibrun instead of mpirun or mpiexec</pre>
</div>
<div id="running-launching-hybrid">
<h3 id="launching-one-hybrid-mpithreads-application"><a href="#running-launching-hybrid">Launching One Hybrid (MPI+Threads) Application</a></h3>
<p>When launching a single application you generally don't need to worry about affinity: both Intel MPI and MVAPICH2 will distribute and pin tasks and threads in a sensible way.</p>
<pre class="job-script">
export OMP_NUM_THREADS=8 # 8 OpenMP threads per MPI rank
ibrun ./mycode.exe # use ibrun instead of mpirun or mpiexec</pre>
</div>
<div id="running-launching-serialmorethanone">
<h3 id="more-than-one-serial-application-in-the-same-job"><a href="#running-launching-serialmorethanone">More Than One Serial Application in the Same Job</a></h3>
<p>TACC's "<code>launcher</code>" utility provides an easy way to launch more than one serial application in a single job. This is a great way to engage in a popular form of High Throughput Computing: running parameter sweeps (one serial application against many different input datasets) on several nodes simultaneously. The <code>launcher</code> utility will execute your specified list of independent serial commands, distributing the tasks evenly, pinning them to specific cores, and scheduling them to keep cores busy. Execute "<code>module load launcher</code>" followed by "<code>module help launcher</code>" for more information.</p>
</div>
<div id="running-launching-consecutivempi">
<h3 id="mpi-applications-one-at-a-time"><a href="#running-launching-consecutivempi">MPI Applications One at a Time</a></h3>
<p>To run one MPI application after another (or any sequence of commands one at a time), simply list them in your job script in the order in which you'd like them to execute. When one application/command completes, the next one will begin.</p>
<pre class="job-script">
module load git
module list
./preprocess.sh
ibrun ./mycode.exe input1 # runs after preprocess.sh completes
ibrun ./mycode.exe input2 # runs after previous MPI app completes</pre>
</div>
<div id="running-launching-mpisimultaneous">
<h3 id="more-than-one-mpi-application-running-concurrently"><a href="#running-launching-mpisimultaneous">More than One MPI Application Running Concurrently</a></h3>
<p>To run more than one MPI application simultaneously in the same job, you need to do several things:</p>
<ul>
<li>use ampersands to launch each instance in the background;</li>
<li>include a <code>wait</code> command to pause the job script until the background tasks complete;</li>
<li>use the <code>ibrun</code> "<code>-n</code>" and "<code>-o</code>" switches to specify task counts and hostlist offsets respectively; and</li>
<li>include a call to the <code>task_affinity</code> script in your <code>ibrun</code> launch line.</li>
</ul>
<p>If, for example, you use <code>#SBATCH</code> directives to request N=4 nodes and n=128 total MPI tasks, Slurm will generate a hostfile with 128 entries (32 entries for each of 4 nodes). The "<code>-n</code>" and "<code>-o</code>" switches, which must be used together, determine which hostfile entries ibrun uses to launch a given application; execute <span style="white-space: nowrap;">"<code>ibrun --help</code>"</span> for more information. <strong>Don't forget the ampersands ("<code>&amp;</code>")</strong> to launch the jobs in the background, <strong>and the "<code>wait</code>" command</strong> to pause the script until the background tasks complete:</p>
<pre class="job-script">
ibrun -n 64 -o 0 task_affinity ./mycode.exe input1 &amp; # 64 tasks; offset by 0 entries in hostfile.
ibrun -n 64 -o 64 task_affinity ./mycode.exe input2 &amp; # 64 tasks; offset by 64 entries in hostfile.
wait # Required; else script will exit immediately.</pre>
<p>The <code>task_affinity</code> script does two things:</p>
<ul>
<li><code>task_affinity</code> manages task placement and pinning when you call <code>ibrun</code> with the "<code>-n</code>, <code>-o</code>" switches (it's not necessary under any other circumstances); and</li>
<li><code>task_affinity</code> also manages MCDRAM when you run in flat-quadrant mode on the KNL. It does this in the same way as <a href="#managing-memory"><code>mem_affinity</code></a>.</li>
<li><strong>Don't confuse <code>task_affinity</code> with <a href="#managing-memory"><code>tacc_affinity</code></a></strong>; the keyword "<code>tacc_affinity</code>" is now a symbolic link to <code>mem_affinity</code>. The <code>mem_affinity</code> script and the symbolic link <code>tacc_affinity</code> manage MCDRAM in flat-quadrant mode on the KNL, but they do not pin MPI tasks.</li>
</ul>
</div>
<div id="running-launching-openmpsimultaneous">
<h3 id="more-than-one-openmp-application-running-concurrently"><a href="#running-launching-openmpsimultaneous">More than One OpenMP Application Running Concurrently</a></h3>
<p>You can also run more than one OpenMP application simultaneously on a single node, but you will need to <a href="http://pages.tacc.utexas.edu/~eijkhout/pcse/html/omp-affinity.html">distribute and pin tasks appropriately</a>. In the example below, <span style="white-space: nowrap;">"<code>numactl -C</code>"</span> specifies virtual CPUs (hardware threads). According to the numbering scheme for KNL hardware threads, CPU (hardware thread) numbers 0-67 are spread across the 68 cores, 1 thread per core. Similarly for SKX: CPU (hardware thread) numbers 0-47 are spread across the 48 cores, 1 thread per core. See <a href="http://portal.tacc.utexas.edu/training#/session/64">TACC training materials</a> for more information.</p>
<pre class="job-script">
export OMP_NUM_THREADS=2
numactl -C 0-1 ./mycode.exe inputfile1 &amp; # HW threads (hence cores) 0-1. Note ampersand.
numactl -C 2-3 ./mycode.exe inputfile2 &amp; # HW threads (hence cores) 2-3. Note ampersand.
wait</pre>
</div>
</div>
<div id="running-idev">
<h2 id="interactive-sessions-with-idev-and-srun"><a href="#running-idev">Interactive Sessions with <code>idev</code> and <code>srun</code></a></h2>
<p>TACC's own <code>idev</code> utility is the best way to begin an interactive session on one or more compute nodes. To launch a thirty-minute session on a single node in the development queue, simply execute:</p>
<pre class="cmd-line">login1$ <b>idev</b></pre>
<p>You'll then see output that includes the following excerpts:</p>
<pre class="cmd-line">
...
-----------------------------------------------------------------
&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;Welcome to the Stampede2 Supercomputer
-----------------------------------------------------------------
...
-&gt; After your idev job begins to run, a command prompt will appear,
-&gt; and you can begin your interactive development session.
-&gt; We will report the job status every 4 seconds: (PD=pending, R=running).
-&gt;job status: PD
-&gt;job status: PD
...
c449-001$</pre>
<p>The "<code>job status</code>" messages indicate that your interactive session is waiting in the queue. When your session begins, you'll see a command prompt on a compute node (in this case, the node with hostname c449-001). If this is the first time you launch <code>idev</code>, the prompts may invite you to choose a default project and a default number of tasks per node for future <code>idev</code> sessions.</p>
<p>For command line options and other information, execute <span style="white-space: nowrap;">"<code>idev --help</code>"</span>. It's easy to tailor your submission request (e.g. shorter or longer duration) using Slurm-like syntax:</p>
<pre class="cmd-line">login1$ <b>idev -p normal -N 2 -n 8 -m 150</b> # normal queue, 2 nodes, 8 total tasks, 150 minutes</pre>
<p>For more information see the <a href="http://portal.tacc.utexas.edu/software/idev"><code>idev</code> documentation</a>.</p>
<p>You can also launch an interactive session with Slurm's <code>srun</code> command, though there's no clear reason to prefer <code>srun</code> to <code>idev</code>. A typical launch line would look like this:</p>
<pre class="cmd-line">login1$ <b>srun --pty -N 2 -n 8 -t 2:30:00 -p normal /bin/bash -l </b># same conditions as above</pre>
</div>
<div id="running-ssh">
<h2 id="interactive-sessions-using-ssh"><a href="#running-ssh">Interactive Sessions using <code>ssh</code></a></h2>
<p>If you have a batch job or interactive session running on a compute node, you "own the node": you can connect via <code>ssh</code> to open a new interactive session on that node. This is an especially convenient way to monitor your applications' progress. One particularly helpful example: login to a compute node that you own, execute "<code>top</code>", then press the "1" key to see a display that allows you to monitor thread ("CPU") and memory use.</p>
<p>There are many ways to determine the nodes on which you are running a job, including feedback messages following your <code>sbatch</code> submission, the compute node command prompt in an <code>idev</code> session, and the <code>squeue</code> or <code>showq</code> utilities. The sequence of identifying your compute node then connecting to it would look like this:</p>
<pre class="cmd-line">
login1$ <b>squeue -u bjones</b>
JOBID PARTITION NAME USER ST TIME NODES NODELIST(REASON)
858811 development idv46796 bjones R 0:39 1 c448-004
login1$ <b>ssh c448-004</b>
...
c448-004$</pre>
</div>
<div id="running-slurmenvvars">
<h2 id="slurm-environment-variables"><a href="#running-slurmenvvars">Slurm Environment Variables</a></h2>
<p>Be sure to distinguish between internal Slurm replacement symbols (e.g. "<code>%j</code>" described above) and Linux environment variables defined by Slurm (e.g. <code>SLURM_JOBID</code>). Execute <span style="white-space: nowrap;">"<code>env | grep SLURM</code>"</span> from within your job script to see the full list of Slurm environment variables and their values. You can use Slurm replacement symbols like "<code>%j</code>" only to construct a Slurm filename pattern; they are not meaningful to your Linux shell. Conversely, you can use Slurm environment variables in the shell portion of your job script but not in an <code>#SBATCH</code> directive. For example, the following directive will not work the way you might think:</p>
<pre class="job-script"><s>#SBATCH -o myMPI.o${SLURM_JOB_ID}</s> # incorrect</pre>
<p>Instead, use the following directive:</p>
<pre class="job-script">#SBATCH -o myMPI.o%j # "%j" expands to your job's numerical job ID</pre>
<p>Similarly, you cannot use paths like <code>$WORK</code> or <code>$SCRATCH</code> in an <code>#SBATCH</code> directive.</p>
<p>For more information on this and other matters related to Slurm job submission, see the <a href="https://slurm.schedmd.com/sbatch.html">Slurm online documentation</a>; the man pages for both Slurm itself (<span style="white-space: nowrap;">"<code>man slurm</code>"</span>) and its individual commands (e.g. <span style="white-space: nowrap;">"<code>man sbatch</code>"</span>); as well as numerous other online resources.</p>
</div>
</div>
<div id="monitoring">
<h1 id="monitoring-jobs-and-queues"><a href="#monitoring">Monitoring Jobs and Queues</a></h1>
<p>Several commands are available to help you plan and track your job submissions as well as check the status of the Slurm queues.</p>
<p>When interpreting queue and job status, remember that <strong>Stampede2 doesn't operate on a first-come-first-served basis</strong>. Instead, the sophisticated, tunable algorithms built into Slurm attempt to keep the system busy, while scheduling jobs in a way that is as fair as possible to everyone. At times this means leaving nodes idle ("draining the queue") to make room for a large job that would otherwise never run. It also means considering each user's "fair share", scheduling jobs so that those who haven't run jobs recently may have a slightly higher priority than those who have.</p>
<div id="monitoring-queue">
<h2 id="monitoring-queue-status-with-sinfo-and-qlimits"><a href="#monitoring-queue">Monitoring Queue Status with <code>sinfo</code> and <code>qlimits</code></a></h2>
<p>To display resource limits for the Stampede2 queues, execute "<strong><code>qlimits</code></strong>". The result is real-time data; the corresponding information in this document's <a href="#running-queues">table of Stampede2 queues</a> may lag behind the actual configuration that the <code>qlimits</code> utility displays.</p>
<p>Slurm's "<strong><code>sinfo</code></strong>" command allows you to monitor the status of the queues. If you execute <code>sinfo</code> without arguments, you'll see a list of every node in the system together with its status. To skip the node list and produce a tight, alphabetized summary of the available queues and their status, execute:</p>
<pre class="cmd-line">login1$ <b>sinfo -S+P -o "%18P %8a %20F"</b> # compact summary of queue status</pre>
<p>An excerpt from this command's output looks like this:</p>
<pre class="cmd-line">
PARTITION AVAIL NODES(A/I/O/T)
development* up 41/70/1/112
normal up 3685/8/3/3696</pre>
<p>The <code>AVAIL</code> column displays the overall status of each queue (up or down), while the column labeled "<code>NODES(A/I/O/T)</code>" shows the number of nodes in each of several states ("<strong>A</strong>llocated", "<strong>I</strong>dle", "<strong>O</strong>ther", and "<strong>T</strong>otal"). Execute "<code>man sinfo</code>" for more information. Use caution when reading the generic documentation, however: some available fields are not meaningful or are misleading on Stampede2 (e.g. <code>TIMELIMIT</code>, displayed using the "<code>%l</code>" option).</p>
</div>
<div id="monitoring-squeue">
<h2 id="monitoring-job-status-with-squeue"><a href="#monitoring-squeue">Monitoring Job Status with <code>squeue</code></a></h2>
<p>Slurm's <code>squeue</code> command allows you to monitor jobs in the queues, whether pending (waiting) or currently running:</p>
<pre class="cmd-line">
login1$ <b>squeue</b> # show all jobs in all queues
login1$ <b>squeue -u bjones</b> # show all jobs owned by bjones
login1$ <b>man squeue</b> # more info</pre>
<p>An excerpt from the default output looks like this:</p>
<pre class="cmd-line">
JOBID PARTITION NAME USER ST TIME NODES NODELIST(REASON)
170361 normal spec12 bjones PD 0:00 32 (Resources)
170356 normal mal2d slindsey PD 0:00 30 (Priority)
170204 normal rr2-a2 tg123456 PD 0:00 1 (Dependency)
170250 development idv59074 aturing R 29:30 1 c455-044
169669 normal 04-99a1 aturing CG 2:47:47 1 c425-003</pre>
<p>The column labeled "<code>ST</code>" displays each job's status:</p>
<ul>
<li>"<code>PD</code>" means "Pending" (waiting);</li>
<li>"<code>R</code>" means "Running";</li>
<li>"<code>CG</code>" means "Completing" (cleaning up after exiting the job script).</li>
</ul>
<p>Pending jobs appear in order of decreasing priority. The last column includes a nodelist for running/completing jobs, or a reason for pending jobs. If you submit a job before a scheduled system maintenance period, and the job cannot complete before the maintenance begins, your job will run when the maintenance/reservation concludes. The <code>squeue</code> command will report "<code>ReqNodeNotAvailable</code>" ("Required Node Not Available"). The job will remain in the <code>PD</code> state until Stampede2 returns to production.</p>
<p>The default format for <code>squeue</code> now reports total nodes associated with a job rather than cores, tasks, or hardware threads. One reason for this change is clarity: the operating system sees each KNL node's 272 hardware threads (and each SKX node's 96 hardware threads) as "processors", and output based on that information can be ambiguous or otherwise difficult to interpret.</p>
<p>The default format lists all nodes assigned to displayed jobs; this can make the output difficult to read. A handy variation that suppresses the nodelist is:</p>
<pre class="cmd-line">login1$ <b>squeue -o "%.10i %.12P %.12j %.9u %.2t %.9M %.6D"</b> # suppress nodelist</pre>
<p>The "<code>--start</code>" option displays job start times, including very rough estimates for the expected start times of some pending jobs that are relatively high in the queue:</p>
<pre class="cmd-line">login1$ <b>squeue --start -j 167635</b> # display estimated start time for job 167635</pre>
</div>
<div id="monitoring-showq">
<h2 id="monitoring-job-status-with-showq"><a href="#monitoring-showq">Monitoring Job Status with <code>showq</code></a></h2>
<p>TACC's "<code>showq</code>" utility mimics a tool that originated in the PBS project, and serves as a popular alternative to the Slurm "<code>squeue</code>" command:</p>
<pre class="cmd-line">
login1$ <b>showq</b> # show all jobs; default format
login1$ <b>showq -u</b> # show your own jobs
login1$ <b>showq -U bjones</b> # show jobs associated with user bjones
login1$ <b>showq -h</b> # more info</pre>
<p>The output groups jobs in four categories: <code>ACTIVE</code>, <code>WAITING</code>, <code>BLOCKED</code>, and <code>COMPLETING/ERRORED</code>. A <strong><code>BLOCKED</code></strong> job is one that cannot yet run due to temporary circumstances (e.g. a pending maintenance or other large reservation).</p>
<p>If your waiting job cannot complete before a maintenance/reservation begins, <code>showq</code> will display its state as "<strong><code>WaitNod</code></strong>" ("Waiting for Nodes"). The job will remain in this state until Stampede2 returns to production.</p>
<p>The default format for <code>showq</code> now reports total nodes associated with a job rather than cores, tasks, or hardware threads. One reason for this change is clarity: the operating system sees each KNL node's 272 hardware threads (and each SKX node's 96 hardware threads) as "processors", and output based on that information can be ambiguous or otherwise difficult to interpret.</p>
</div>
<div id="monitoring-other">
<h2 id="other-job-management-commands-scancel-scontrol-and-sacct"><a href="#monitoring-other">Other Job Management Commands (<code>scancel</code>, <code>scontrol</code>, and <code>sacct</code>)</a></h2>
<p><strong>It's not possible to add resources to a job (e.g. allow more time)</strong> once you've submitted the job to the queue.</p>
<p>To <strong>cancel</strong> a pending or running job, first determine its jobid, then use <code>scancel</code>:</p>
<pre class="cmd-line">
login1$ <b>squeue -u bjones</b> # one way to determine jobid
JOBID PARTITION NAME USER ST TIME NODES NODELIST(REASON)
170361 normal spec12 bjones PD 0:00 32 (Resources)
login1$ <b>scancel 170361</b> # cancel job</pre>
<p>For <strong>detailed information</strong> about the configuration of a specific job, use <code>scontrol</code>:</p>
<pre class="cmd-line">login1$ <b>scontrol show job=170361</b></pre>
<p>To view some <strong>accounting data</strong> associated with your own jobs, use <code>sacct</code>:</p>
<pre class="cmd-line">login1$ <b>sacct --starttime 2017-08-01</b> # show jobs that started on or after this date</pre>
</div>
<div id="monitoring-dependent">
<h2 id="dependent-jobs-using-sbatch"><a href="#monitoring-dependent">Dependent Jobs using <code>sbatch</code></a></h2>
<p>You can use <code>sbatch</code> to help manage workflows that involve multiple steps: the "<code>--dependency</code>" option allows you to launch jobs that depend on the completion (or successful completion) of another job. For example you could use this technique to split into three jobs a workflow that requires you to (1) compile on a single node; then (2) compute on 40 nodes; then finally (3) post-process your results using 4 nodes.</p>
<pre class="cmd-line">login1$ <b>sbatch --dependency=afterok:173210 myjobscript</b></pre>
<p>For more information see the <a href="http://www.schedmd.com">Slurm online documentation</a>. Note that you can use <code>$SLURM_JOBID</code> from one job to find the jobid you'll need to construct the <code>sbatch</code> launch line for a subsequent one. But also remember that you can't use <code>sbatch</code> to submit a job from a compute node.</p>
</div>
</div>
<div id="vis">
<h1 id="visualization-and-virtual-network-computing-vnc-sessions"><a href="#vis">Visualization and Virtual Network Computing (VNC) Sessions</a></h1>
<p>Stampede2 uses the SKX and KNL processors for all visualization and rendering operations. We use the Intel OpenSWR library to render raster graphics with OpenGL, and the Intel OSPRay framework for ray traced images inside visualization software. <strong>On Stampede2, "<code>swr</code>" replaces "<code>vglrun</code>" (e.g. "<code>swr glxgears</code>") and uses similar syntax.</strong> OpenSWR can be loaded by executing "<code>module load swr</code>". We expect most users will notice little difference in visualization experience on KNL. MCDRAM may improve visualization performance for some users. SKX nodes may provide better interactivity for intensive rendering applications.</p>
<p>There is currently no separate visualization queue on Stampede2. All visualization apps are available on all nodes. VNC and DCV sessions are available on any queue, either through the command line or via the <a href="https://vis.tacc.utexas.edu/">TACC Visualization Portal</a>. We recommend submitting to the <code>development</code> queue (for KNL) or the <code>skx-dev</code> queue (for SKX) for interactive sessions. If you are interested in an application that is not yet available, please submit a help desk ticket through the TACC or XSEDE User Portal.</p>
<div id="vis-remote">
<h2 id="remote-desktop-access"><a href="#vis-remote">Remote Desktop Access</a></h2>
<p>Remote desktop access to Stampede2 is formed through a VNC connection to one or more visualization nodes. Users must first connect to a Stampede2 login node (see System Access) and submit a special interactive batch job that:</p>
<ul>
<li>allocates a set of Stampede2 visualization nodes</li>
<li>starts a vncserver process on the first allocated node</li>
<li>sets up a tunnel through the login node to the vncserver access port</li>
</ul>
<p>Once the vncserver process is running on the visualization node and a tunnel through the login node is created, an output message identifies the access port for connecting a VNC viewer. A VNC viewer application is run on the user's remote system and presents the desktop to the user.</p>
<p>Note: If this is your first time connecting to Stampede2, you must run <code>vncpasswd</code> to create a password for your VNC servers. This should NOT be your login password! This mechanism only deters unauthorized connections; it is not fully secure, as only the first eight characters of the password are saved. All VNC connections are tunneled through SSH for extra security, as described below.</p>
<p>Follow the steps below to start an interactive session.</p>
<ol type="1">
<li><p>Start a Remote Desktop</p> <p>TACC has provided a VNC job script (<code>/share/doc/slurm/job.vnc</code>) that requests one node in the <a href="#running-queues"><code>development</code> queue</a> for two hours, creating a <a href="https://en.wikipedia.org/wiki/VNC">VNC</a> session.</p> <pre class="cmd-line">login1$ <b>sbatch /share/doc/slurm/job.vnc</b></pre> <p>You may modify or overwrite script defaults with <code>sbatch</code> command-line options:</p>
<ul>
<li>"<code>-t <i>hours:minutes:seconds</i></code>" modify the job runtime</li>
<li>"<code>-A <i>projectnumber</i></code>" specify the project/allocation to be charged</li>
<li>"<code>-N <i>nodes</i></code>" specify number of nodes needed</li>
<li>"<code>-p <i>partition</i></code>" specify an alternate queue.</li>
</ul> <p>See more <code>sbatch</code> options in the <a href="#table6">Common <code>sbatch</code> Options</a> table.</p> <p>All arguments after the job script name are sent to the vncserver command. For example, to set the desktop resolution to 1440x900, use:</p> <pre class="cmd-line">login1$ <b>sbatch /share/doc/slurm/job.vnc -geometry 1440x900</b></pre> <p>The "<code>job.vnc</code>" script starts a vncserver process and writes to the output file, "<code>vncserver.out</code>" in the job submission directory, with the connect port for the vncviewer. Watch for the "To connect via VNC client" message at the end of the output file, or watch the output stream in a separate window with the commands:</p> <pre class="cmd-line">login1$ <b>touch vncserver.out ; tail -f vncserver.out</b></pre> <p>The lightweight window manager, <code>xfce</code>, is the default VNC desktop and is recommended for remote performance. Gnome is available; to use gnome, open the "<code>~/.vnc/xstartup</code>" file (created after your first VNC session) and replace "<code>startxfce4</code>" with "<code>gnome-session</code>". Note that gnome may lag over slow internet connections.</p></li>
<li><p>Create an SSH Tunnel to Stampede2</p> <p>TACC requires users to create an SSH tunnel from the local system to the Stampede2 login node to assure that the connection is secure. On a Unix or Linux system, execute the following command once the port has been opened on the Stampede2 login node:</p> <pre class="cmd-line">
localhost$ <b>ssh -f -N -L <i>xxxx</i>:stampede2.tacc.utexas.edu:<i>yyyy</i>
&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;username@stampede2.tacc.utexas.edu</b></pre> <p>where</p>
<ul>
<li>"<code><i>yyyy</i></code>" is the port number given by the vncserver batch job</li>
<li>"<code><i>xxxx</i></code>" is a port on your local system. Generally, the port number specified on the Stampede2 login node, <code><i>yyyy</i></code>, is a good choice to use on your local system as well</li>
<li>"<code>-f</code>" puts the <code>ssh</code> command into the background after connecting</li>
<li>"<code>-N</code>" instructs SSH to only forward ports, not to execute a remote command</li>
<li>"<code>-L</code>" forwards the port</li>
</ul> <p>On Windows systems find the menu in the Windows SSH client where tunnels can be specified, and enter the local and remote ports as required, then <code>ssh</code> to Stampede2.</p></li>
<li><p>Connecting vncviewer</p> <p>Once the SSH tunnel has been established, use a <a href="https://en.wikipedia.org/wiki/Virtual_Network_Computing">VNC client</a> to connect to the local port you created, which will then be tunneled to your VNC server on Stampede2. Connect to <code>localhost:<i>xxxx</i></code>, where <code><i>xxxx</i></code> is the local port you used for your tunnel. In the examples above, we would connect the VNC client to <code>localhost::<i>xxxx</i></code>. (Some VNC clients accept <code>localhost:<i>xxxx</i></code>).</p> <p>We recommend the <a href="http://sourceforge.net/projects/tigervnc/">TigerVNC</a> VNC Client, a platform independent client/server application.</p> <p>Once the desktop has been established, two initial xterm windows are presented (which may be overlapping). One, which is white-on-black, manages the lifetime of the VNC server process. Killing this window (typically by typing "<code>exit</code>" or "<code>ctrl-D</code>" at the prompt) will cause the vncserver to terminate and the original batch job to end. Because of this, we recommend that this window not be used for other purposes; it is just too easy to accidentally kill it and terminate the session.</p> <p>The other xterm window is black-on-white, and can be used to start both serial programs running on the node hosting the vncserver process, or parallel jobs running across the set of cores associated with the original batch job. Additional xterm windows can be created using the window-manager left-button menu.</p></li>
</ol>
</div>
<div id="vis-apps">
<h2 id="running-applications-on-the-vnc-desktop"><a href="#vis-apps">Running Applications on the VNC Desktop</a></h2>
<p>From an interactive desktop, applications can be run from icons or from xterm command prompts. Two special cases arise: running parallel applications, and running applications that use OpenGL.</p>
</div>
<div id="vis-parallelapps">
<h2 id="running-parallel-applications-from-the-desktop"><a href="#vis-parallelapps">Running Parallel Applications from the Desktop</a></h2>
<p>Parallel applications are run on the desktop using the same <code>ibrun</code> wrapper described above (see <a href="#running">Running</a>). The command:</p>
<pre class="cmd-line">c442-001$ <b>ibrun <i>ibrunoptions</i> application <i>applicationoptions</i></b></pre>
<p>will run application on the associated nodes, as modified by the <code>ibrun</code> options.</p>
</div>
<div id="vis-opengl">
<h2 id="running-openglx-applications-on-the-desktop"><a href="#vis-opengl">Running OpenGL/X Applications On The Desktop</a></h2>
<p>Stampede2 uses the OpenSWR OpenGL library to perform efficient rendering. At present, the compute nodes on Stampede2 do not support native X instances. All windowing environments should use a VNC desktop launched via the job script in /share/doc/slurm/job.vnc or using the TACC Vis portal.</p>
<p>swr: To access the accelerated OpenSWR OpenGL library, it is necessary to use the swr module to point to the swr OpenGL implementation and configure the number of threads to allocate to rendering.</p>
<pre class="cmd-line">
c442-001$ <b>module load swr</b>
c442-001$ <b>swr <i>options</i> application <i>application-args</i></b></pre>
</div>
<div id="vis-visit">
<h2 id="parallel-visit-on-stampede2"><a href="#vis-visit">Parallel VisIt on Stampede2</a></h2>
<p><a href="https://wci.llnl.gov/simulation/computer-codes/visit/manuals">VisIt</a> was compiled under the Intel compiler and the mvapich2 and MPI stacks.</p>
<p>After connecting to a VNC server on Stampede2, as described above, load the VisIt module at the beginning of your interactive session before launching the VisIt application:</p>
<pre class="cmd-line">
c442-001$ <b>module load swr visit</b>
c442-001$ <b>swr visit</b></pre>
<p>VisIt first loads a dataset and presents a dialog allowing for selecting either a serial or parallel engine. Select the parallel engine. Note that this dialog will also present options for the number of processes to start and the number of nodes to use; these options are actually ignored in favor of the options specified when the VNC server job was started.</p>
<div id="vis-visit-preparingdata">
<h3 id="preparing-data-for-parallel-visit"><a href="#vis-visit-preparingdata">Preparing data for Parallel VisIt</a></h3>
<p>In order to take advantage of parallel processing, VisIt input data must be partitioned and distributed across the cooperating processes. This requires that the input data be explicitly partitioned into independent subsets at the time it is input to VisIt. VisIt supports <a href="https://en.wikipedia.org/wiki/Silo_%28library%29">SILO</a> data, which incorporates a parallel, partitioned representation. Otherwise, VisIt supports a metadata file (with a <code>.visit</code> extension) that lists multiple data files of any supported format that are to be associated into a single logical dataset. In addition, VisIt supports a "brick of values" format, also using the <code>.visit</code> metadata file, which enables single files containing data defined on rectilinear grids to be partitioned and imported in parallel. Note that VisIt does not support VTK parallel XML formats (<code>.pvti</code>, <code>.pvtu</code>, <code>.pvtr</code>, <code>.pvtp</code>, and <code>.pvts</code>). For more information on importing data into VisIt, see <a href="https://wci.llnl.gov/codes/visit/2.0.0/GettingDataIntoVisIt2.0.0.pdf">Getting Data Into VisIt</a>; though this documentation refers to VisIt version 2.0, it appears to be the most current available.</p>
</div>
</div>
<div id="vis-paraview">
<h2 id="parallel-paraview-on-stampede2"><a href="#vis-paraview">Parallel ParaView on Stampede2</a></h2>
<p>After connecting to a VNC server on Stampede2, as described above, do the following:</p>
<ol type="1">
<li><p>Set up your environment with the necessary modules. Load the <code>swr</code>, <code>qt5</code>, <code>ospray</code>, and <code>paraview</code> modules <strong>in this order</strong>:</p> <pre class="cmd-line">c442-001$ <b>module load swr qt5 ospray paraview</b></pre></li>
<li><p>Launch ParaView:</p> <pre class="cmd-line">
c442-001$ <b>swr -p 1 paraview [<i>paraview client options</i>]</b></pre></li>
<li><p>Click the "Connect" button, or select File -&gt; Connect</p></li>
<li><p>Select the "auto" configuration, then press "Connect". In the Paraview Output Messages window, you'll see what appears to be an "lmod" error, but it can be ignored. Then you'll see the parallel servers being spawned and the connection established.</p></li>
</ol>
</div>
</div>
<div id="programming">
<h1 id="programming-and-performance"><a href="#programming">Programming and Performance</a></h1>
<div id="programming-general">
<h2 id="programming-and-performance-general"><a href="#programming-general">Programming and Performance: General</a></h2>
<p>Programming for performance is a broad and rich topic. While there are no shortcuts, there are certainly some basic principles that are worth considering any time you write or modify code.</p>
<div id="programming-general-timingprofiling">
<h3 id="timing-and-profiling"><a href="#programming-general-timingprofiling">Timing and Profiling</a></h3>
<p><strong>Measure performance and experiment with both compiler and runtime options.</strong> This will help you gain insight into issues and opportunities, as well as recognize the performance impact of code changes and temporary system conditions.</p>
<p>Measuring performance can be as simple as prepending the shell keyword "<code>time</code>" or the command "<code>perf stat</code>" to your launch line. Both are simple to use and require no code changes. Typical calls look like this:</p>
<pre class="cmd-line">
<b>perf stat ./a.out</b> # report basic performance stats for a.out
<b>time ./a.out</b> # report the time required to execute a.out
<b>time ibrun ./a.out</b> # time an MPI code
<b>ibrun time ./a.out</b> # crude timings for each MPI task (no rank info)</pre>
<p>As your needs evolve you can add timing intrinsics to your source code to time specific loops or other sections of code. There are many such intrinsics available; some popular choices include <a href="http://man7.org/linux/man-pages/man2/gettimeofday.2.html"><code>gettimeofday</code></a>, <a href="https://www.mpich.org/static/docs/v3.2/www3/MPI_Wtime.html"><code>MPI_Wtime</code></a> and <a href="https://www.openmp.org/spec-html/5.0/openmpsu160.html"><code>omp_get_wtime</code></a>. The resolution and overhead associated with each of these timers is on the order of a microsecond.</p>
<p>It can be helpful to compare results with different compiler and runtime options: e.g. with and without <a href="http://software.intel.com/en-us/fortran-compiler-18.0-developer-guide-and-reference-vec-qvec">vectorization</a>, <a href="#running-launching-multi">threading</a>, or <a href="#files-striping">Lustre striping</a>. You may also want to learn to use profiling tools like <a href="http://software.intel.com/en-us/intel-vtune-amplifier-xe">Intel VTune Amplifier</a> <span style="white-space: nowrap;">("<code>module load vtune</code>")</span> or GNU <a href="http://sourceware.org/binutils/docs/gprof/"><code>gprof</code></a>.</p>
</div>
<div id="programming-general-datalocality">
<h3 id="data-locality"><a href="#programming-general-datalocality">Data Locality</a></h3>
<p><strong>Appreciate the high cost (performance penalty) of moving data from one node to another</strong>, from disk to RAM, and even from RAM to cache. Write your code to keep data as close to the computation as possible: e.g. in RAM when needed, and on the node that needs it. This means keeping in mind the capacity and characteristics of each level of the memory hierarchy when designing your code and planning your simulations. A simple KNL-specific example illustrates the point: all things being equal, there's a good chance you'll see better performance when you keep your data in the KNL's <a href="#programming-knl-memorymodes">fast MCDRAM</a> instead of the slower DDR4.</p>
<p>When possible, best practice also calls for so-called "stride 1 access": looping through large, contiguous blocks of data, touching items that are adjacent in memory as the loop proceeds. The goal here is to use "nearby" data that is already in cache rather than going back to main memory (a cache miss) in every loop iteration.</p>
<p>To achieve stride 1 access you need to understand how your program stores its data. Here C and C++ are different than (in fact the opposite of) Fortran. C and C++ are row-major: they store 2d arrays a row at a time, so elements <code>a[3][4]</code> and <code>a[3][5]</code> are adjacent in memory. Fortran, on the other hand, is column-major: it stores a column at a time, so elements <code>a(4,3)</code> and <code>a(5,3)</code> are adjacent in memory. Loops that achieve stride 1 access in the two languages look like this:</p>
</div>
<table border="1" cellspacing="3" cellpadding="3">
<tbody>
<tr>
<th>Fortran example</th>
<th>C example</th>
</tr>
<tr>
<td> <pre>real*8 :: a(m,n), b(m,n), c(m,n)
&nbsp;...
! inner loop strides through col i
do i=1,n
&nbsp;&nbsp;do j=1,m
&nbsp;&nbsp;&nbsp;&nbsp;a(j,i)=b(j,i)+c(j,i)
&nbsp;&nbsp;end do
end do</pre> </td>
<td><pre>double a[m][n], b[m][n], c[m][n];
&nbsp;...
// inner loop strides through row i
for (i=0;i&lt;m;i++){
&nbsp;&nbsp;for (j=0;j&lt;n;j++){
&nbsp;&nbsp;&nbsp;&nbsp;a[i][j]=b[i][j]+c[i][j];
&nbsp;&nbsp;}
}</pre> </td>
</tr>
</tbody>
</table>
<div id="programming-general-vectorization">
<h3 id="vectorization"><a href="#programming-general-vectorization">Vectorization</a></h3>
<p><strong>Give the compiler a chance to produce efficient, <a href="http://software.intel.com/en-us/articles/vectorization-essential">vectorized</a> code</strong>. The compiler can do this best when your inner loops are simple (e.g. no complex logic and a straightforward matrix update like the ones in the examples above), long (many iterations), and avoid complex data structures (e.g. objects). See Intel's note on <a href="http://software.intel.com/en-us/node/522571">Programming Guidelines for Vectorization</a> for a nice summary of the factors that affect the compiler's ability to vectorize loops.</p>
<p>It's often worthwhile to generate <a href="http://software.intel.com/en-us/articles/getting-the-most-out-of-your-intel-compiler-with-the-new-optimization-reports">optimization and vectorization reports</a> when using the Intel compiler. This will allow you to see exactly what the compiler did and did not do with each loop, together with reasons why.</p>
</div>
<div id="programming-general-more">
<h3 id="learning-more"><a href="#programming-general-more">Learning More</a></h3>
<p>The literature on optimization is vast. Some places to begin a systematic study of optimization on Intel processors include: Intel's <a href="http://software.intel.com/en-us/modern-code">Modern Code</a> resources; the <a href="http://intel.com/content/www/us/en/architecture-and-technology/64-ia-32-architectures-optimization-manual">Intel Optimization Reference Manual</a>; and <a href="http://portal.tacc.utexas.edu/training#/session/64">TACC training materials</a>.</p>
</div>
</div>
<div id="programming-knl">
<h2 id="programming-and-performance-knl"><a href="#programming-knl">Programming and Performance: KNL</a></h2>
<div id="programming-knl-architecture">
<h3 id="architecture"><a href="#programming-knl-architecture">Architecture</a></h3>
<p>KNL cores are grouped in pairs; each pair of cores occupies a tile. Since there are 68 cores on each Stampede2 KNL node, each node has 34 active tiles. These 34 active tiles are connected by a two-dimensional mesh interconnect. Each KNL has 2 DDR memory controllers on opposite sides of the chip, each with 3 channels. There are 8 controllers for the fast, on-package MCDRAM, two in each quadrant.</p>
<p>Each core has its own local L1 cache (32KB data, 32KB instruction) and two 512-bit vector units. Both vector units can execute <code>AVX512</code> instructions, but only one can execute legacy vector instructions (<code>SSE</code>, <code>AVX</code>, and <code>AVX2</code>). Therefore, to use both vector units, you must compile with <span style="white-space: nowrap;"><code>-xMIC-AVX512</code></span>.</p>
<p>Each core can run up to 4 hardware threads. The two cores on a tile share a 1MB L2 cache. Different <a href="#programming-knl-clustermodes">cluster modes</a> specify the L2 cache coherence mechanism at the node level.</p>
</div>
<div id="programming-knl-memorymodes">
<h3 id="memory-modes"><a href="#programming-knl-memorymodes">Memory Modes</a></h3>
<p>The processor's memory mode determines whether the fast MCDRAM operates as RAM, as direct-mapped L3 cache, or as a mixture of the two. The output of commands like "<code>top</code>", "<code>free</code>", and <span style="white-space: nowrap;">"<code>ps -v</code>"</span> reflects the consequences of memory mode. Such commands will show the amount of RAM available to the operating system, not the hardware (DDR + MCDRAM) installed.</p>
</div>
<div id="figure4">
<figure>
<p><img alt="KNL Memory Modes" src="/documents/10157/1334612/KNL+Memory+Modes.png/f19c64b8-6007-4a08-a6f3-80cabf9a2c20" style="width: 800px; height: 171px;"> </p>
<figcaption>
Figure 4. KNL Memory Modes
</figcaption>
</figure>
<ul>
<li><p><strong>Cache Mode</strong>. In this mode, the fast MCDRAM is configured as an L3 cache. The operating system transparently uses the MCDRAM to move data from main memory. In this mode, the user has access to 96GB of RAM, all of it traditional DDR4. <strong>Most Stampede2 KNL nodes are configured in cache mode.</strong></p></li>
<li><p><strong>Flat Mode</strong>. In this mode, DDR4 and MCDRAM act as two distinct Non-Uniform Memory Access (NUMA) nodes. It is therefore possible to specify the type of memory (DDR4 or MCDRAM) when allocating memory. In this mode, the user has access to 112GB of RAM: 96GB of traditional DDR and 16GB of fast MCDRAM. By default, memory allocations occur only in DDR4. To use MCDRAM in flat mode, use the <code>numactl</code> utility or the <code>memkind</code> library; see <a href="#programming-knl-managingmemory">Managing Memory</a> for more information. If you do not modify the default behavior you will have access only to the slower DDR4.</p></li>
<li><p><strong>Hybrid Mode (not available on Stampede2)</strong>. In this mode, the MCDRAM is configured so that a portion acts as L3 cache and the rest as RAM (a second NUMA node supplementing DDR4).</p></li>
</ul>
</div>
<div id="programming-knl-clustermodes">
<h3 id="cluster-modes"><a href="#programming-knl-clustermodes">Cluster Modes</a></h3>
<p>The KNL's core-level L1 and tile-level L2 caches can reduce the time it takes for a core to access the data it needs. To share memory safely, however, there must be mechanisms in place to ensure cache coherency. Cache coherency means that all cores have a consistent view of the data: if data value x changes on a given core, there must be no risk of other cores using outdated values of x. This, of course, is essential on any multi-core chip, but it is especially difficult to achieve on manycore processors.</p>
<p>The details for KNL are proprietary, but the key idea is this: each tile tracks an assigned range of memory addresses. It does so on behalf of all cores on the chip, maintaining a data structure (tag directory) that tells it which cores are using data from its assigned addresses. Coherence requires both tile-to-tile and tile-to-memory communication. Cores that read or modify data must communicate with the tiles that manage the memory associated with that data. Similarly, when cores need data from main memory, the tile(s) that manage the associated addresses will communicate with the memory controllers on behalf of those cores.</p>
<p>The KNL can do this in several ways, each of which is called a cluster mode. Each cluster mode, specified in the BIOS as a boot-time option, represents a tradeoff between simplicity and control. There are three major cluster modes with a few minor variations:</p>
<ul>
<li><p><strong>All-to-All</strong>. This is the most flexible and most general mode, intended to work on all possible hardware and memory configurations of the KNL. But this mode also may have higher latencies than other cluster modes because the processor does not attempt to optimize coherency-related communication paths. Stampede2 does not have nodes in this cluster mode.</p></li>
<li><p><strong>Quadrant (variation: hemisphere)</strong>. This is Intel's recommended default, and the cluster mode of most Stampede2 queues. This mode attempts to localize communication without requiring explicit memory management by the programmer/user. It does this by grouping tiles into four logical/virtual (not physical) quadrants, then requiring each tile to manage MCDRAM addresses only in its own quadrant (and DDR addresses in its own half of the chip). This reduces the average number of "hops" that tile-to-memory requests require compared to all-to-all mode, which can reduce latency and congestion on the mesh.</p></li>
<li><p><strong>Sub-NUMA 4 (variation: Sub-NUMA 2)</strong>. This mode, abbreviated <strong>SNC-4</strong>, divides the chip into four NUMA nodes so that it acts like a four-socket processor. SNC-4 aims to optimize coherency-related on-chip communication by confining this communication to a single NUMA node when it is possible to do so. To achieve any performance benefit, this requires explicit manual memory management by the programmer/user (in particular, allocating memory within the NUMA node that will use that memory). Stampede2 does not have nodes in this cluster mode.</p></li>
</ul>
</div>
<div id="figure5">
<figure>
<p><img alt="KNL Cluster Modes" src="/documents/10157/1334612/KNL+Cluster+Modes.png/142c5092-c736-48ea-a3cc-f3d9d3f43647" style="width: 800px; height: 274px;"> </p>
<figcaption>
Figure 5. KNL Cluster Modes
</figcaption>
</figure>
<p>TACC's early experience with the KNL suggests that there is little reason to deviate from Intel's recommended default memory and cluster modes. Cache-quadrant tends to be a good choice for almost all workflows; it offers a nice compromise between performance and ease of use for the applications we have tested. Flat-quadrant is the most promising alternative and sometimes offers moderately better performance, especially when memory requirements per node are less than 16GB. We have not yet observed significant performance differences across cluster modes, and our current recommendation is that configurations other than cache-quadrant and flat-quadrant are worth considering only for very specialized needs. For more information see <a href="#programming-knl-managingmemory">Managing Memory</a> and <a href="#programming-knl-bestpractices">Best Known Practices…</a>.</p>
</div>
<div id="programming-knl-managingmemory">
<h3 id="managing-memory"><a href="#programming-knl-managingmemory">Managing Memory</a></h3>
<p>By design, any application can run in any memory and cluster mode, and applications always have access to all available RAM. Moreover, regardless of memory and cluster modes, there are no code changes or other manual interventions required to run your application safely. However, there are times when explicit manual memory management is worth considering to improve performance. The Linux <code>numactl</code> (pronounced "NUMA Control") utility allows you to specify at runtime where your code should allocate memory.</p>
<p>When running in flat-quadrant mode, launch your code with <a href="#example">simple <code>numactl</code> settings</a> to specify whether memory allocations occur in DDR or MCDRAM. See <a href="/training">TACC Training Materials</a> for additional information.</p>
</div>
<div id="example">
<pre class="job-sript">
numactl --membind=0 ./a.out # launch a.out (non-MPI); use DDR (default)
ibrun numactl --membind=0 ./a.out # launch a.out (MPI-based); use DDR (default)
numactl --membind=1 ./a.out # use only MCDRAM
numactl --preferred=1 ./a.out # (<b>RECOMMENDED</b>) MCDRAM if possible; else DDR
numactl --hardware # show numactl settings
numactl --help # list available numactl options</pre>
<p>Examples. Controlling memory in flat-quadrant mode: <code>numactl</code> options</p>
<p>Intel's new <code>memkind</code> library adds the ability to manage memory in source code with a special memory allocator for C code and a corresponding attribute for Fortran. This makes possible a level of control over memory allocation down to the level of the individual data element. As this library matures it will likely become an important tool for those who need fine-grained control of memory.</p>
<p>When you're running MPI codes in the flat-quadrant queue, the <code>mem_affinity</code> script simplifies memory management by calling <code>numactl</code> "under the hood" to make plausible NUMA (Non-Uniform Memory Access) policy choices. For MPI and hybrid applications, the script attempts to ensure that each MPI process uses MCDRAM efficiently. To launch your MPI code with <code>mem_affinity</code>, simply place "<code>mem_affinity</code>" immediately after "<code>ibrun</code>":</p>
<pre><code> ibrun mem_affinity a.out</code></pre>
<p>It's safe to use <code>mem_affinity</code> even when it will have no effect (e.g. cache-quadrant mode). Note that <code>mem_affinity</code> and <code>numactl</code> cannot be used together.</p>
<p>On Stampede2 the keyword "<code>tacc_affinity</code>" was originally an older name for what is now the "<code>mem_affinity</code>" script. To ensure backward compatibility, <code>tacc_affinity</code> is now a symbolic link to <code>mem_affinity</code>. Note that <code>mem_affinity</code> and the symbolic link <code>tacc_affinity</code> do not pin MPI tasks.</p>
</div>
<div id="programming-knl-bestpractices">
<h3 id="best-known-practices-and-preliminary-observations-knl"><a href="#programming-knl-bestpractices">Best Known Practices and Preliminary Observations (KNL)</a></h3>
<p><strong>Hyperthreading. It is rarely a good idea to use all 272 hardware threads simultaneously</strong>, and it's certainly not the first thing you should try. In most cases it's best to specify no more than <span style="white-space: nowrap;">64-68</span> MPI tasks or independent processes per node, and 1-2 threads/core. One exception is worth noting: when calling threaded MKL from a serial code, it's safe to set <code>OMP_NUM_THREADS</code> or <code>MKL_NUM_THREADS</code> to 272. This is because MKL will choose an appropriate thread count less than or equal to the value you specify. See <a href="#mkl-threading">Controlling Threading in MKL</a> for more information. In any case remember that the default value of <code>OMP_NUM_THREADS</code> is 1.</p>
<p><strong>When measuring KNL performance against traditional processors, compare node-to-node rather than core-to-core.</strong> KNL cores run at lower frequencies than traditional multicore processors. Thus, for a fixed number of MPI tasks and threads, a given simulation may run 2-3x slower on KNL than the same submission ran on Stampede1's Sandy Bridge nodes. A well-designed parallel application, however, should be able to run more tasks and/or threads on a KNL node than is possible on Sandy Bridge. If so, it may exhibit better performance per KNL node than it does on Sandy Bridge.</p>
<p><strong>General Expectations</strong>. From a pure hardware perspective, a single Stampede2 KNL node could outperform Stampede1's dual socket Sandy Bridge nodes by as much as 6x; this is true for both memory bandwidth-bound and compute-bound codes. This assumes the code is running out of (fast) MCDRAM on nodes configured in flat mode (450 GB/s bandwidth vs 75 GB/s on Sandy Bridge) or using cache-contained workloads on nodes configured in cache mode (memory footprint &lt; 16GB). It also assumes perfect scalability and no latency issues. In practice we have observed application improvements between 1.3x and 5x for several HPC workloads typically run in TACC systems. Codes with poor vectorization or scalability could see much smaller improvements. In terms of network performance, the Omni-Path network provides 100 Gbits per second peak bandwidth, with point-to-point exchange performance measured at over 11 GBytes per second for a single task pair across nodes. Latency values will be higher than those for the Sandy Bridge FDR Infiniband network: on the order of 2-4 microseconds for exchanges across nodes.</p>
<p><strong>MCDRAM in Flat-Quadrant Mode</strong>. Unless you have specialized needs, we recommend using <code>mem_affinity</code> or launching your application with <span style="white-space: nowrap;">"<code>numactl --preferred=1</code>"</span> when running in flat-quadrant mode (see <a href="#knl-programming-managingmemory">Managing Memory</a> above). If you mistakenly use <span style="white-space: nowrap;">"<code>--membind=1</code>"</span>, only the 16GB of fast MCDRAM will be available. If you mistakenly use <span style="white-space: nowrap;">"<code>--membind=0</code>"</span>, you will not be able to access fast MCDRAM at all.</p>
<p><strong>Task Affinity</strong>. If you're running one threaded, MPI, or hybrid application at a time, default affinity settings are usually sensible and often optimal. See <a href="https://portal.tacc.utexas.edu/training#/session/41">TACC training materials</a> for more information. If you run more than one threaded, MPI, or hybrid application at a time, you'll want to pay attention to affinity. For more information see the appropriate sub-sections under <a href="#running-launching">Launching Applications</a>.</p>
<p><strong>MPI Initialization</strong>. Our preliminary scaling tests with Intel MPI on Stampede2 suggest that the time required to complete MPI initialization scales quadratically with the number of MPI tasks (lower case "<code>-n</code>" in your Slurm submission script) and linearly with the number of nodes (upper case <span style="white-space: nowrap;">"<code>-N</code>"</span>).</p>
<p><strong>Tuning the Performance Scaled Messaging (PSM2) Library</strong>. When running on KNL with MVAPICH2, set the environment variable <code>PSM2_KASSIST_MODE</code> to the value "<code>none</code>" per the <a href="http://mvapich.cse.ohio-state.edu/static/media/mvapich/mvapich2-2.3b-userguide.html#x1-890006.19">MVAPICH2 User Guide</a>. Do not use this environment variable with IMPI; doing so may degrade performance. <span style="color:red">The <code>ibrun</code> launcher will eventually control this environment variable automatically.</span></p>
</div>
</div>
<div id="programming-skx">
<h2 id="programming-and-performance-skx"><a href="#programming-skx">Programming and Performance: SKX</a></h2>
<p><strong>Hyperthreading. It is rarely a good idea to use 96 hardware threads simultaneously</strong>, and it's certainly not the first thing you should try. In most cases it's best to specify no more than 48 MPI tasks or independent processes per node, and 1-2 threads/core. One exception is worth noting: when calling threaded MKL from a serial code, it's safe to set <code>OMP_NUM_THREADS</code> or <code>MKL_NUM_THREADS</code> to 96. This is because MKL will choose an appropriate thread count less than or equal to the value you specify. See <a href="#mkl-threading">Controlling Threading in MKL</a> for more information. In any case remember that the default value of <code>OMP_NUM_THREADS</code> is 1.</p>
</div>
<div id="clockspeeds">
<p><strong>Clock Speed.</strong> The published nominal clock speed of the Stampede2 SKX processors is 2.1GHz. But <a href="https://www.intel.com/content/www/us/en/architecture-and-technology/turbo-boost/turbo-boost-technology.html">actual clock speed varies widely</a>: it depends on the vector instruction set, number of active cores, and other factors affecting power requirements and temperature limits. At one extreme, a single serial application using the <code>AVX2</code> instruction set may run at frequencies approaching 3.7GHz, because it's running on a single core (in fact a single hardware thread). At the other extreme, a large, fully-threaded MKL <code>dgemm</code> (a highly vectorized routine in which all cores operate at nearly full throttle) may run at 1.4GHz.</p>
<p><strong>Vector Optimization and <code>AVX2</code>.</strong> In some cases, using the <code>AVX2</code> instruction set may produce better performance than <code>AVX512</code>. This is largely because cores can run at higher <a href="#clockspeeds">clock speeds</a> when executing <code>AVX2</code> code. To compile for <code>AVX2</code>, replace the <a href="#building-performance-architecture">multi-architecture flags</a> described above with the single flag "<code>-xCORE-AVX2</code>". When you use this flag you will be able to build and run on any Stampede2 node.</p>
<p><strong>Vector Optimization and 512-Bit ZMM Registers.</strong> If your code can take advantage of wide 512-bit vector registers, you may want to try <a href="#building-performance-architecture">compiling for SKX</a> with (for example):</p>
<pre><code>-xCORE-AVX512 -qopt-zmm-usage=high</code></pre>
<p>The <span style="white-space: nowrap;">"<code>qopt-zmm-usage</code>"</span> flag affects the algorithms the compiler uses to decide whether to vectorize a given loop with <code>AVX512</code> intrinsics (wide 512-bit registers) or <code>AVX2</code> code (256-bit registers). When the flag is set to <span style="white-space: nowrap;">"<code>-qopt-zmm-usage=low</code>"</span> (the default when compiling for the SKX using <span style="white-space: nowrap;"><code>CORE-AVX512</code>)</span>, the compiler will choose <code>AVX2</code> code more often; this may or may not be the optimal approach for your application. The <span style="white-space: nowrap;"><code>qopt-zmm-usage</code></span> flag is available only on Intel compilers newer than 17.0.4. Do not use <a href="#building-performance-architecture"><code>$TACC_VEC_FLAGS</code></a> when specifying <span style="white-space: nowrap;"><code>qopt-zmm-usage</code></span>. This is because <code>$TACC_VEC_FLAGS</code> specifies <span style="white-space: nowrap;"><code>CORE-AVX2</code></span> as the base architecture, and the compiler will ignore <span style="white-space: nowrap;"><code>qopt-zmm-usage</code></span> unless the base target is a variant of <code>AVX512</code>. See the recent <a href="https://software.intel.com/en-us/articles/tuning-simd-vectorization-when-targeting-intel-xeon-processor-scalable-family">Intel white paper</a>, the <a href="https://software.intel.com/en-us/cpp-compiler-18.0-developer-guide-and-reference-qopt-zmm-usage-qopt-zmm-usage">compiler documentation</a>, the compiler man pages, and the notes above for more information.</p>
<p><strong>Vector Optimization and <code>COMMON-AVX512</code>.</strong> We have encountered a few complex packages that currently fail to build or run when compiled with <a href="#building-performance-architecture"><code>CORE-AVX512</code></a> (native SKX). In all cases so far, these packages build and run well on both KNL and SKX when compiled as a single-architecture binary with <a href="#building-performance-architecture"><code>-xCOMMON-AVX512</code></a>.</p>
<p><strong>Task Affinity.</strong> If you run one MPI application at a time, the <code>ibrun</code> MPI launcher will spread each node's tasks evenly across an SKX node's two sockets, with consecutive tasks occupying the same socket when possible.</p>
<p><strong>Hardware Thread Numbering.</strong> Execute "<code>lscpu</code>" or "<code>lstopo</code>" on an SKX node to see the numbering scheme for hardware threads. Note that hardware thread numbers alternate between the sockets: even numbered threads are on NUMA node 0, while odd numbered threads are on NUMA node 1. Furthermore, the two hardware threads on a given core have thread numbers that differ by exactly 48 (e.g. threads 3 and 51 are on the same core).</p>
<p><strong>Tuning the Performance Scaled Messaging (PSM2) Library</strong>. When running on SKX with MVAPICH2, setting the environment variable <code>PSM2_KASSIST_MODE</code> to the value "<code>none</code>" may or may not improve performance. For more information see the <a href="http://mvapich.cse.ohio-state.edu/static/media/mvapich/mvapich2-2.3b-userguide.html#x1-890006.19">MVAPICH2 User Guide</a>. Do not use this environment variable with IMPI; doing so may degrade performance. <span style="color:red">The <code>ibrun</code> launcher will eventually control this environment variable automatically.</span></p>
</div>
<div id="programming-fileio">
<h2 id="file-operations-io-performance"><a href="#programming-fileio">File Operations: I/O Performance</a></h2>
<p>This section includes general advice intended to help you achieve good performance during file operations. See <a href="#files-filesystems">Navigating the Shared File Systems</a> for a brief overview of Stampede2's Lustre file systems and the concept of striping. See <a href="https://learn.tacc.utexas.edu/">TACC Training material</a> for additional information on I/O performance.</p>
<p><strong>Follow the advice in <a href="#shared-lustre-file-systems">Good Citizenship</a></strong> to avoid stressing the file system.</p>
<p><strong>Stripe for performance</strong>. If your application writes large files using MPI-based parallel I/O (including <a href="http://mpi-forum.org/docs/mpi-3.1/mpi31-report.pdf">MPI-IO</a>, <a href="https://support.hdfgroup.org/HDF5/PHDF5/">parallel HDF5</a>, and <a href="https://www.unidata.ucar.edu/software/netcdf/docs/parallel_io.html">parallel netCDF</a>), you should experiment with stripe counts larger than the default values (2 stripes on <code>$SCRATCH</code>, 1 stripe on <code>$WORK</code>). See <a href="#files-striping">Striping Large Files</a> for the simplest way to set the stripe count on the directory in which you will create new output files. You may also want to try larger stripe sizes up to 16MB or even 32MB; execute "<code>man lfs</code>" for more information. If you write many small files you should probably leave the stripe count at its default value, especially if you write each file from a single process. Note that it's not possible to change the stripe parameters on files that already exist. This means that you should make decisions about striping when you <em>create</em> input files, not when you read them.</p>
<p><strong>Aggregate file operations</strong>. Open and close files once. Read and write large, contiguous blocks of data at a time; this requires understanding how a given programming language uses memory to <a href="#programming-general-datalocality">store arrays</a>.</p>
<p><strong>Be smart about your general strategy</strong>. When possible avoid an I/O strategy that requires each process to access its own files; such strategies don't scale well and are likely to stress a Lustre file system. A better approach is to use a single process to read and write files. Even better is genuinely parallel MPI-based I/O.</p>
<p><strong>Use parallel I/O libraries</strong>. Leave the details to a high performance package like <a href="http://mpi-forum.org/docs/mpi-3.1/mpi31-report.pdf">MPI-IO</a> (built into MPI itself), <a href="https://support.hdfgroup.org/HDF5/PHDF5/">parallel HDF5</a> <span style="white-space: nowrap;">("<code>module load phdf5</code>")</span>, and <a href="https://www.unidata.ucar.edu/software/netcdf/docs/parallel_io.html">parallel netCDF</a> <span style="white-space: nowrap;">("<code>module load pnetcdf</code>")</span>.</p>
<p>When using the Intel Fortran compiler, <strong>compile with "<a href="https://software.intel.com/en-us/fortran-compiler-18.0-developer-guide-and-reference-assume"><code>-assume buffered_io</code></a>"</strong>. Equivalently, set the environment variable <a href="https://software.intel.com/en-us/node/680054"><code>FORT_BUFFERED=TRUE</code></a>. Doing otherwise can dramatically slow down access to variable length unformatted files. More generally, direct access in Fortran is typically faster than sequential access, and accessing a binary file is faster than ASCII.</p>
</div>
</div>
<div id="help">
<h1 id="help-desk"><a href="#help">Help Desk</a></h1>
<p><a href="https://portal.tacc.utexas.edu/consulting/overview">TACC Consulting</a> operates from 8am to 5pm CST, Monday through Friday, except for holidays. You can <a href="https://portal.tacc.utexas.edu/tacc-consulting/-/consult/tickets/create">submit a help desk ticket</a> at any time via the TACC User Portal with "Stampede2" in the Resource field. Help the consulting staff help you by following these best practices when submitting tickets.</p>
<ul>
<li><p><strong>Do your homework</strong> before submitting a help desk ticket. What do the user guide and other documentation say? Search the internet for key phrases in your error logs; that's probably what the consultants answering your ticket are going to do. What have you changed since the last time your job succeeded?</p></li>
<li><p><strong>Describe your issue as precisely and completely as you can:</strong> what you did, what happened, verbatim error messages, other meaningful output. When appropriate, include the information a consultant would need to find your artifacts and understand your workflow: e.g. the directory containing your build and/or job script; the modules you were using; relevant job numbers; and recent changes in your workflow that could affect or explain the behavior you're observing.</p></li>
<li><p><strong>Subscribe to <a href="https://portal.tacc.utexas.edu/user-news/-/news/Stampede2">Stampede2 User News</a>.</strong> This is the best way to keep abreast of maintenance schedules, system outages, and other general interest items.</p></li>
<li><p><strong>Have realistic expectations.</strong> Consultants can address system issues and answer questions about Stampede2. But they can't teach parallel programming in a ticket, and may know nothing about the package you downloaded. They may offer general advice that will help you build, debug, optimize, or modify your code, but you shouldn't expect them to do these things for you.</p></li>
<li><p><strong>Be patient.</strong> It may take a business day for a consultant to get back to you, especially if your issue is complex. It might take an exchange or two before you and the consultant are on the same page. If the admins disable your account, it's not punitive. When the file system is in danger of crashing, or a login node hangs, they don't have time to notify you before taking action.</p></li>
</ul>
</div>
<div id="refs">
<h1 id="references"><a href="#refs">References</a></h1>
<ul>
<li><a href="https://portal.tacc.utexas.edu/tutorials/bashquickstart">Bash Users' Startup Files: Quick Start Guide</a></li>
<li><a href="http://portal.tacc.utexas.edu/software/idev"><code>idev</code> documentation</a></li>
<li><a href="https://www.gnu.org/doc/doc.en.html">GNU documentation</a></li>
<li><a href="http://software.intel.com/en-us/intel-software-technical-documentation">Intel software documentation</a></li>
<li><a href="http://lmod.readthedocs.org">Lmod's online documentation</a></li>
<li><a href="http://portal.tacc.utexas.edu/tutorials/multifactor-authentication">Multi-Factor Authentication at TACC</a></li>
<li><a href="http://portal.tacc.utexas.edu/tutorials/sharing-project-files">Sharing Project Files on TACC Systems</a></li>
<li><a href="http://www.schedmd.com">Slurm online documentation</a></li>
<li><a href="https://portal.tacc.utexas.edu/training#/guest?training=upcoming">TACC training materials</a></li>
<li><a href="https://vis.tacc.utexas.edu/">TACC Visualization Portal</a></li>
</ul>
</div>
<script type="text/javascript">/*<![CDATA[*/
// Toggle the collapsible "revisions" list under Revision History and swap the
// disclosure arrow image to match: right arrow when collapsed, down arrow when open.
function showhide() {
  var panel = document.getElementById("revisions");
  var arrow = document.getElementById("img-arrow");

  if (panel.style.display == "block") {
    // Currently open: show the collapsed arrow, then hide the list.
    arrow.src = "/documents/10157/0/small-right-arrow.png/32a37818-3255-40f3-bb33-795fac19a3dd?t=1480440057000";
    panel.style.display = "none";
  } else {
    // Currently hidden (or display not yet set inline to "block"): reveal the list, then show the open arrow.
    panel.style.display = "block";
    arrow.src = "/documents/10157/0/small-down-arrow.png/05df5954-1484-4c56-8e54-1c2c0eb501dc?t=1480439900000";
  }
}
/*]]>*/</script>
<div id="history">
<h1 id="revision-history"><a href="#history">Revision History</a></h1>
<p>"Last Update" at the top of this document is the date of the most recent change to this document. This revision history is a list of non-trivial updates; it excludes routine items such as corrected typos and minor format changes.</p> <a href="javascript:showhide()"><img src="/documents/10157/0/small-right-arrow.png/32a37818-3255-40f3-bb33-795fac19a3dd?t=1480440057000" id="img-arrow" alt="">Click to view</a>
<div id="revisions" style="display:none">
<ul>
<li>04/24/18 Changes to Table 1 and Table 5 associated with new <code>long</code> queue.</li>
<li>04/03/18 Stampede1 decommissioned; removed/revised references to Stampede1 as appropriate.</li>
<li>03/26/18 Corrected and relocated material on <code>qopt-zmm-usage</code>.</li>
<li>02/23/18 New functionality associated with <code>task_affinity</code>, <code>tacc_affinity</code>, and <code>mem_affinity</code> (scripts related to MPI task pinning and KNL memory management).</li>
<li>11/30/17 Initial release supporting Phase 2 (SKX).</li>
<li>08/02/17 Removed references and links to Stampede2 Transition Guide (now deprecated).</li>
<li>06/12/17 Initial public release.</li>
</ul>
</div>
</div>
</div>
<div class="entry-links">
</div>
</div>
</div>
</div>
</div>
</div>
</div>
</div>
</div>
<div class="portlet-boundary portlet-boundary_1_WAR_kaleodesignerportlet_ portlet-static portlet-static-end portlet-borderless kaleo-designer-portlet " id="p_p_id_1_WAR_kaleodesignerportlet_"> <span id="p_1_WAR_kaleodesignerportlet"></span>
<div class="portlet-body">
</div>
</div>
<form action="#" id="hrefFm" method="post" name="hrefFm"> <span></span>
</form>
</div>
<footer id="footer" role="contentinfo">
<!-- Site footer. Links that open in a new tab carry rel="noopener" so the opened page cannot reach this page through window.opener. -->
<div class="footer-message">
<ul class="inline">
<li><a target="_blank" rel="noopener" href="http://www.utexas.edu/research">Office of the Vice President for Research</a></li>
<li><a href="mailto:www@tacc.utexas.edu">Feedback</a></li>
<li><a href="https://portal.tacc.utexas.edu/user-guides/stampede2?p_p_auth=3zGDtOgQ&amp;p_p_id=49&amp;p_p_lifecycle=1&amp;p_p_state=normal&amp;p_p_mode=view&amp;_49_struts_action=%2Fmy_sites%2Fview&amp;_49_groupId=10157&amp;_49_privateLayout=false">Home</a></li>
<li><a target="_blank" rel="noopener" href="http://www.facebook.com/tacc.utexas">Facebook</a></li>
<li><a target="_blank" rel="noopener" href="http://twitter.com/TACC_Hedda">Twitter</a></li>
<li><a target="_blank" rel="noopener" href="http://www.tacc.utexas.edu/about/contact-us">Contact</a></li>
</ul>
<hr style="width:50%">
<p> ©2001-2021 <a target="_blank" rel="noopener" href="http://www.tacc.utexas.edu">Texas Advanced Computing Center</a>, <a target="_blank" rel="noopener" href="http://www.utexas.edu">The University of Texas at Austin</a> </p>
</div>
</footer>
</div>
<!-- Liferay page init: calls Liferay.Util.addInputFocus() and sets Liferay.Portlet.runtimePortletIds to the two portlet IDs listed below. -->
<script type="text/javascript">/*<![CDATA[*/Liferay.Util.addInputFocus();Liferay.Portlet.runtimePortletIds=["1_WAR_kaleodesignerportlet","103"];/*]]>*/</script>
<!--
  Generated Liferay bootstrap (minified). In order, this script:
  1. Calls Liferay.Portlet.onLoad for portlets "103" and "56_INSTANCE_RYXn3pn9Wi4j", passing each a refreshURL
     whose characters are written as \x escapes.
  2. Uses Liferay.provide to define Liferay.Util.openKaleoDesignerPortlet(c): it builds namespaced, URI-encoded
     query parameters from the truthy fields of c, stores the resulting URL in c.uri, fills in default
     dialog settings (modal, 85% of the opener region width, centered) and dialogIframe settings when
     absent, then calls Liferay.Util.openWindow(c).
  3. Calls Liferay.Portlet.onLoad for portlet "1_WAR_kaleodesignerportlet".
  4. Via AUI().use(...): calls Liferay.Util.addInputType (again for each portlet reported by
     Liferay.Portlet.ready), creates a Liferay.Menu, and creates a Liferay.Notice for every entry of
     Liferay.Data.notices starting at index 1 (index 0 is skipped by the loop).
-->
<script type="text/javascript">/*<![CDATA[*/Liferay.Portlet.onLoad({canEditTitle:false,columnPos:0,isStatic:"end",namespacedId:"p_p_id_103_",portletId:"103",refreshURL:"\x2fc\x2fportal\x2frender_portlet\x3fp_l_id\x3d1475721\x26p_p_id\x3d103\x26p_p_lifecycle\x3d0\x26p_t_lifecycle\x3d0\x26p_p_state\x3dnormal\x26p_p_mode\x3dview\x26p_p_col_id\x3d\x26p_p_col_pos\x3d0\x26p_p_col_count\x3d0\x26p_p_isolated\x3d1\x26currentURL\x3d\x252Fuser-guides\x252Fstampede2\x253Bjsessionid\x253DAF04850B586BFC725A1DA07C09C26690"});Liferay.Portlet.onLoad({canEditTitle:false,columnPos:0,isStatic:"end",namespacedId:"p_p_id_56_INSTANCE_RYXn3pn9Wi4j_",portletId:"56_INSTANCE_RYXn3pn9Wi4j",refreshURL:"\x2fc\x2fportal\x2frender_portlet\x3fp_l_id\x3d1475721\x26p_p_id\x3d56_INSTANCE_RYXn3pn9Wi4j\x26p_p_lifecycle\x3d0\x26p_p_col_id\x3dcolumn-1\x26p_p_col_count\x3d1\x26p_t_lifecycle\x3d0\x26p_p_state\x3dnormal\x26p_p_mode\x3dview\x26p_p_col_id\x3dcolumn-1\x26p_p_col_pos\x3d0\x26p_p_col_count\x3d1\x26p_p_isolated\x3d1\x26currentURL\x3d\x252Fuser-guides\x252Fstampede2\x253Bjsessionid\x253DAF04850B586BFC725A1DA07C09C26690"});Liferay.provide(Liferay.Util,"openKaleoDesignerPortlet",function(c){var b=AUI();var e=Liferay.Util.getPortletNamespace("2_WAR_kaleodesignerportlet");var g=[];b.Object.each({availableFields:c.availableFields,availablePropertyModels:c.availablePropertyModels,ddmStructureId:c.ddmStructureId,draftVersion:c.draftVersion,kaleoProcessId:c.kaleoProcessId,name:c.name,openerWindowName:c.openerWindowName,portletResourceNamespace:c.portletResourceNamespace,propertiesSaveCallback:c.propertiesSaveCallback,refreshOpenerOnClose:c.refreshOpenerOnClose,saveCallback:c.saveCallback,uiScope:c.uiScope,version:c.version},function(i,h,j){if(i){g.push(e+encodeURIComponent(h)+"="+encodeURIComponent(i))}});c.uri=Liferay.Util.addParams(g.join("&"),c.baseKaleoDesignerURL);var d=c.dialog;if(!d){var 
f=b.one(Liferay.Util.getOpener()).get("region");d={modal:true,title:c.name,width:f.width*0.85};c.dialog=d}if(!("align" in d)){d.align=Liferay.Util.Window.ALIGN_CENTER}var a=c.dialogIframe;if(!a){a={closeOnEscape:false};c.dialogIframe=a}Liferay.Util.openWindow(c)},["liferay-portlet-url"]);Liferay.Portlet.onLoad({canEditTitle:false,columnPos:0,isStatic:"end",namespacedId:"p_p_id_1_WAR_kaleodesignerportlet_",portletId:"1_WAR_kaleodesignerportlet",refreshURL:"\x2fc\x2fportal\x2frender_portlet\x3fp_l_id\x3d1475721\x26p_p_auth\x3dT6F1vi1X\x26p_p_id\x3d1_WAR_kaleodesignerportlet\x26p_p_lifecycle\x3d0\x26p_t_lifecycle\x3d0\x26p_p_state\x3dnormal\x26p_p_mode\x3dview\x26p_p_col_id\x3d\x26p_p_col_pos\x3d0\x26p_p_col_count\x3d0\x26p_p_isolated\x3d1\x26currentURL\x3d\x252Fuser-guides\x252Fstampede2\x253Bjsessionid\x253DAF04850B586BFC725A1DA07C09C26690"});AUI().use("aui-base","liferay-menu","liferay-notice","liferay-poller",function(a){(function(){Liferay.Util.addInputType();Liferay.Portlet.ready(function(b,c){Liferay.Util.addInputType(c)})})();(function(){new Liferay.Menu();var b=Liferay.Data.notices;for(var c=1;c<b.length;c++){new Liferay.Notice(b[c])}})()});/*]]>*/</script>
<script src="https://portal.tacc.utexas.edu/portal-theme/js/main.js?browserId=other&amp;minifierType=js&amp;languageId=en_US&amp;b=6120&amp;t=1615492875000" type="text/javascript"></script>
<!--
  Inline, minified copy of the jQuery "toc" plugin (v0.3.3 per its banner) followed by page setup code.
  1. Defines $.fn.smoothScroller (animated scroll to an element, then updates the URL hash) and a delegated
     click handler for elements carrying the data-smoothscroller attribute.
  2. Defines $.fn.toc, which builds a list of links to the headings matched by the "selectors" option and,
     when highlightOnScroll is set, marks the entry nearest the current scroll position with activeClass.
  3. On DOM ready, if an element with class "enable-toc" exists: adds class "with-toc" to body, inserts a div
     with id "toc" and a "Table of Contents" toggle button after the first heading inside it, and builds the
     table of contents from that container.
  4. Calls $("#toc").toc(...) over the whole body with selectors h1 through h5.
  NOTE(review): that last call passes smoothScrolling:true, but the plugin only smooth-scrolls when the option
  is a function (typeof check in the click handler), so a boolean leaves plain anchor navigation. Confirm intended.
-->
<script type="text/javascript">/*<![CDATA[*//*!
* toc - jQuery Table of Contents Plugin
* v0.3.3
* http://projects.jga.me/toc/
* copyright Greg Allen 2015
* MIT License
*/
!function(b){b.fn.smoothScroller=function(a){a=b.extend({},b.fn.smoothScroller.defaults,a);var d=b(this);return b(a.scrollEl).animate({scrollTop:d.offset().top-b(a.scrollEl).offset().top-a.offset},a.speed,a.ease,function(){var c=d.attr("id");c.length&&(history.pushState?history.pushState(null,null,"#"+c):document.location.hash=c),d.trigger("smoothScrollerComplete")}),this},b.fn.smoothScroller.defaults={speed:400,ease:"swing",scrollEl:"body,html",offset:0},b("body").on("click","[data-smoothscroller]",function(a){a.preventDefault();var d=b(this).attr("href");0===d.indexOf("#")&&b(d).smoothScroller()})}(jQuery),function(d){var c={};d.fn.toc=function(t){var s,r=this,q=d.extend({},jQuery.fn.toc.defaults,t),p=d(q.container),o=d(q.selectors,p),n=q.activeClass,m=function(){var e=[];return o.each(function(h,g){var b=d(g);e.push(b.offset().top-q.highlightOffset)}),e},l=function(e,h){if(q.smoothScrolling&&"function"==typeof q.smoothScrolling){e.preventDefault();var g=d(e.target).attr("href");q.smoothScrolling(g,q,h)}d("li",r).removeClass(n),d(e.target).parent().addClass(n)},a=function(e){s&&clearTimeout(s),s=setTimeout(function(){for(var A,z=d(window).scrollTop(),y=d(window).height(),x=Number.MAX_VALUE,w=0,v=m(),u=0,i=v.length;i>u;u++){var h=Math.abs(v[u]-z);x>h&&v[u]<z+y&&(w=u,x=h)}d("li",r).removeClass(n),A=d("li:eq("+w+")",r).addClass(n),q.onHighlight(A)},50)};return q.highlightOnScroll&&(d(window).bind("scroll",a),a()),this.each(function(){var e=d(this),f=d(q.listType);o.each(function(w,v){var u=d(v),k=q.anchorName(w,v,q.prefix);if(v.id!==k){d("<span/>").attr("id",k).insertBefore(u)}var 
j=d("<a/>").text(q.headerText(w,v,u)).attr("href","#"+k).bind("click",function(g){d(window).unbind("scroll",a),l(g,function(){d(window).bind("scroll",a)}),e.trigger("selected",d(this).attr("href"))}),b=d("<li/>").addClass(q.itemClass(w,v,u,q.prefix)).append(j);f.append(b)}),e.html(f)})},jQuery.fn.toc.defaults={container:"body",listType:"<ul/>",selectors:"h1,h2,h3",smoothScrolling:function(a,f,e){d(a).smoothScroller({offset:f.scrollToOffset}).on("smoothScrollerComplete",function(){e()})},scrollToOffset:0,prefix:"toc",activeClass:"toc-active",onHighlight:function(){},highlightOnScroll:!0,highlightOffset:100,anchorName:function(j,i,h){if(i.id.length){return i.id}var b=d(i).text().replace(/[^a-z0-9]/gi," ").replace(/\s+/g,"-").toLowerCase();if(c[b]){for(var a=2;c[b+a];){a++}b=b+"-"+a}return c[b]=!0,h+"-"+b},headerText:function(f,e,g){return g.data("toc-title")||g.text()},itemClass:function(f,e,h,g){return g+"-"+h[0].tagName.toLowerCase()}}}(jQuery);(function(a,b,c){b(function(){var g=b(".enable-toc");if(g.length>0){b("body").addClass("with-toc");var e=b("h1,h2,h3,h4,h5,h6",g).eq(0);var f=b('<div id="toc">');e.after(f);f.toc({container:g});var d=b('<button name="toc-toggle" class="toc-toggle">');d.html("<span>Table of Contents</span>");e.after(d);d.on("click",function(){b("body").toggleClass("with-toc")})}})})(window,jQuery);$("#toc").toc({selectors:"h1,h2,h3,h4,h5",container:"body",smoothScrolling:true,prefix:"toc",onHighlight:function(a){},highlightOnScroll:true,highlightOffset:100,anchorName:function(a,c,b){return b+a},headerText:function(a,c,b){return b.text()},itemClass:function(a,d,b,c){return b[0].tagName.toLowerCase()}});/*]]>*/</script>
</body>
</html>