Cluster Reliability / Troubleshooting Replication Issues

Code Recap: Identifying Replication Lag Issues

Resize the oplog with replSetResizeOplog

replSetResizeOplog Admin Command

Use the replSetResizeOplog admin command to resize the oplog or its minimum retention period dynamically without restarting the mongod process.

db.adminCommand(
   {
     replSetResizeOplog: <int>,
     size: <double>,
     minRetentionHours: <double>
   }
 )

Set minimum retention hours for the oplog with replSetResizeOplog

Below is an example of a mongod.conf file that has been edited to specify a minimum oplog retention period with the storage.oplogMinRetentionHours setting.

Configure the storage.oplogMinRetentionHours setting in mongod.conf

You may also use the storage.oplogMinRetentionHours setting in your mongod.conf file to set a minimum oplog retention period (in hours):

# mongod.conf

# where to write logging data.
systemLog:
  destination: file
  logAppend: true
  path: /var/log/mongodb/mongod.log

# Where and how to store data.
storage:
  dbPath: /var/lib/mongo
  oplogMinRetentionHours: <double>

Check replica set health and replication lag with rs.status() and rs.printSecondaryReplicationInfo()

rs.status() Method

Use the rs.status() method to return the replica set status from the point of view of the member where the method is run.

rs.status()

Example Output:

{
   "set" : "replset",
   "date" : ISODate("2024-08-15T23:06:13.978Z"),
   "myState" : 1,
   "term" : Long(3),
   "syncSourceHost" : "",
   "syncSourceId" : -1,
   "heartbeatIntervalMillis" : Long(2000),
   "majorityVoteCount" : 2,
   "writeMajorityCount" : 2,
   "votingMembersCount" : 3,
   "writableVotingMembersCount" : 3,
   "optimes" : {
      "lastCommittedOpTime" : {
         "ts" : Timestamp(1723763173, 1),
         "t" : Long(3)
      },
      "lastCommittedWallTime" : ISODate("2024-08-15T23:06:13.978Z"),
      "readConcernMajorityOpTime" : {
         "ts" : Timestamp(1723763173, 1),
         "t" : Long(3)
      },
      "readConcernMajorityWallTime" : ISODate("2024-08-15T23:06:13.978Z"),
      "appliedOpTime" : {
         "ts" : Timestamp(1723763173, 1),
         "t" : Long(3)
      },
      "durableOpTime" : {
         "ts" : Timestamp(1723763173, 1),
         "t" : Long(3)
      },
      "lastAppliedWallTime" : ISODate("2024-08-15T23:06:13.978Z"),
      "lastDurableWallTime" : ISODate("2024-08-15T23:06:13.978Z")
   },
   "lastStableRecoveryTimestamp" : Timestamp(1723763173, 1),
   "electionCandidateMetrics" : {
      "lastElectionReason" : "stepUpRequestSkipDryRun",
      "lastElectionDate" : ISODate("2024-08-15T23:06:13.978Z"),
      "electionTerm" : Long(3),
      "lastCommittedOpTimeAtElection" : {
         "ts" : Timestamp(1723763173, 1),
         "t" : Long(2)
      },
      "lastSeenOpTimeAtElection" : {
         "ts" : Timestamp(1723763173, 1),
         "t" : Long(2)
      },
      "numVotesNeeded" : 2,
      "priorityAtElection" : 1,
      "electionTimeoutMillis" : Long(10000),
      "priorPrimaryMemberId" : 1,
      "numCatchUpOps" : Long(0),
      "newTermStartDate" : ISODate("2024-08-15T23:06:13.978Z"),
      "wMajorityWriteAvailabilityDate" : ISODate("2024-08-15T23:06:13.978Z")
   },
   "electionParticipantMetrics" : {
      "votedForCandidate" : true,
      "electionTerm" : Long(2),
      "lastVoteDate" : ISODate("2024-08-15T23:06:13.978Z"),
      "electionCandidateMemberId" : 1,
      "voteReason" : "",
      "lastAppliedOpTimeAtElection" : {
         "ts" : Timestamp(1723763173, 1),
         "t" : Long(1)
      },
      "maxAppliedOpTimeInSet" : {
         "ts" : Timestamp(1723763173, 1),
         "t" : Long(1)
      },
      "priorityAtElection" : 1
   },

…

   "members" : [
      {
         "_id" : 0,
         "name" : "m1.example.net:27017",
         "health" : 1,
         "state" : 1,
         "stateStr" : "PRIMARY",
         "uptime" : 269,
         "optime" : {
            "ts" : Timestamp(1723763173, 1),
            "t" : Long(3)
         },
         "optimeDurable" : {
            "ts" : Timestamp(1723763173, 1),
            "t" : Long(3)
         },
         "optimeDate" : ISODate("2024-08-15T23:06:13.978Z"),
         "optimeWrittenDate" : ISODate("2024-08-15T23:06:13.978Z"),
         "lastAppliedWallTime": ISODate("2024-08-15T23:06:13.978Z"),
         "lastDurableWallTime": ISODate("2024-08-15T23:06:13.978Z"),
         "syncSourceHost" : "",
         "syncSourceId" : -1,
         "infoMessage" : "",
         "electionTime" : Timestamp(1723763173, 1),
         "electionDate" : ISODate("2024-08-15T23:06:13.978Z"),
         "configVersion" : 1,
         "configTerm" : 0,
         "self" : true,
         "lastHeartbeatMessage" : ""
      },
      {
         "_id" : 1,
         "name" : "m2.example.net:27017",
         "health" : 1,
         "state" : 2,
         "stateStr" : "SECONDARY",
         "uptime" : 266,
         "optime" : {
            "ts" : Timestamp(1723763173, 1),
            "t" : Long(3)
         },
         "optimeDurable" : {
            "ts" : Timestamp(1723763173, 1),
            "t" : Long(3)
         },
         "optimeDate" : ISODate("2024-08-15T23:06:13.978Z"),
         "optimeDurableDate" : ISODate("2024-08-15T23:06:13.978Z"),
         "lastAppliedWallTime": ISODate("2024-08-15T23:06:13.978Z"),
         "lastDurableWallTime": ISODate("2024-08-15T23:06:13.978Z"),
         "lastHeartbeat" : ISODate("2024-08-15T23:06:13.978Z"),
         "lastHeartbeatRecv" : ISODate("2024-08-15T23:06:13.978Z"),
         "pingMs" : Long(0),
         "lastHeartbeatMessage" : "",
         "syncSourceHost" : "m3.example.net:27017",
         "syncSourceId" : 2,
         "infoMessage" : "",
         "configVersion" : 1
      },
      {
         "_id" : 2,
         "name" : "m3.example.net:27017",
         "health" : 1,
         "state" : 2,
         "stateStr" : "SECONDARY",
         "uptime" : 266,
         "optime" : {
            "ts" : Timestamp(1723763173, 1),
            "t" : Long(3)
         },
         "optimeDurable" : {
            "ts" : Timestamp(1723763173, 1),
            "t" : Long(3)
         },
         "optimeDate" : ISODate("2024-08-15T23:06:13.978Z"),
         "optimeDurableDate" : ISODate("2024-08-15T23:06:13.978Z"),
         "lastAppliedWallTime": ISODate("2024-08-15T23:06:13.978Z"),
         "lastDurableWallTime": ISODate("2024-08-15T23:06:13.978Z"),
         "lastHeartbeat" : ISODate("2024-08-15T23:06:13.978Z"),
         "lastHeartbeatRecv" : ISODate("2024-08-15T23:06:13.978Z"),
         "pingMs" : Long(0),
         "lastHeartbeatMessage" : "",
         "syncSourceHost" : "m1.example.net:27017",
         "syncSourceId" : 0,
         "infoMessage" : "",
         "configVersion" : 1
      }
   ],
   "ok" : 1,
   "$clusterTime" : {
      "clusterTime" : Timestamp(1723763173, 1),
      "signature" : {
         "hash" : BinData(0,"9C2qcGVkipEGJW3iF90qxb/gIwc="),
         "keyId" : Long("6800589497806356482")
      }
   },
   "operationTime" : Timestamp(1723763173, 1)
}

rs.printSecondaryReplicationInfo() Method

Use the rs.printSecondaryReplicationInfo() method to return a formatted report of the replica set status from the perspective of the secondary member of the set. The output is identical to db.printSecondaryReplicationInfo().

rs.printSecondaryReplicationInfo()

Example Output:

source: m1.example.net:27002
    syncedTo: Mon Mar 01 2021 16:30:50 GMT-0800 (PST)
    0 secs (0 hrs) behind the primary
source: m2.example.net:27003
    syncedTo: Mon Mar 01 2021 16:30:50 GMT-0800 (PST)
    0 secs (0 hrs) behind the primary