MongoDB moveChunk failure analysis and handling (bug: SERVER-5351)
Tue Mar 4 20:34:12 [conn25194] CMD: movechunk: { moveChunk: "archive.archive", to: "shard1_1", find: { uid: -2130706432 } }
Tue Mar 4 20:34:12 [conn25194] moving chunk ns: archive.archive moving ( ns:archive.archive at: shard3_2:shard3_2/192.168.130.13:15001,192.168.130.14:15002,10.1.2.13:15003 lastmod: 1|2||000000000000000000000000 min: { uid: -2130706432 } max: { uid: MaxKey }) shard3_2:shard3_2/192.168.130.13:15001,192.168.130.14:15002,10.1.2.13:15003 -> shard1_1:shard1_1/10.1.2.12:13003,192.168.130.11:13001,192.168.130.12:13002
Tue Mar 4 20:34:20 [LockPinger] cluster 192.168.130.14:9001,192.168.130.15:9001,192.168.130.16:9001 pinged successfully at Tue Mar 4 20:34:20 2014 by distributed lock pinger '192.168.130.14:9001,192.168.130.15:9001,192.168.130.16:9001/yz-uc-core-a08:27227:1392462352:1804289383', sleeping for 30000ms
Wed Mar 5 05:34:57 [conn4754510] moveChunk data transfer progress: { active: true, ns: "archive.archive", from: "shard3_2/192.168.130.13:15001,192.168.130.14:15002,10.1.2.13:15003", min: { uid: -2130706432 }, max: { uid: MaxKey }, shardKeyPattern: { uid: 1 }, state: "catchup", counts: { cloned: 0, clonedBytes: 0, catchup: 0, steady: 0 }, ok: 1.0 } my mem used: 0
Wed Mar 5 06:40:13 [conn4754510] distributed lock 'archive.archive/yz-uc-core-a03:15001:1349773025:1657874366' unlocked.
Wed Mar 5 06:40:13 [conn4754510] about to log metadata event: { _id: "yz-uc-core-a03-2014-03-04T22:40:13-163", server: "yz-uc-core-a03", clientAddr: "192.168.130.18:21606", time: new Date(1393972813700), what: "moveChunk.from", ns: "archive.archive", details: { min: { uid: -2130706432 }, max: { uid: MaxKey }, step1 of 6: 1, step2 of 6: 442, step3 of 6: 9, step4 of 6: 36000896, note: "aborted" } }
Wed Mar 5 06:40:13 [conn4754510] command admin.$cmd command: { moveChunk: "archive.archive", from: "shard3_2/192.168.130.13:15001,192.168.130.14:15002,10.1.2.13:15003", to: "shard1_1/10.1.2.12:13003,192.168.130.11:13001,192.168.130.12:13002", fromShard: "shard3_2", toShard: "shard1_1", min: { uid: -2130706432 }, max: { uid: MaxKey }, maxChunkSizeBytes: 134217728, shardId: "archive.archive-uid_-2130706432", configdb: "192.168.130.14:9001,192.168.130.15:9001,192.168.130.16:9001", secondaryThrottle: false, $auth: { local: { __system: 2 } } } ntoreturn:1 keyUpdates:0 locks(micros) r:66 w:28 reslen:398 36361617ms
Tue Mar 4 22:14:47 [migrateThread] warning: migrate commit waiting for 3 slaves for 'archive.archive' { uid: -2130706432 } -> { uid: MaxKey } waiting for: 5315c844:6
Wed Mar 5 06:40:13 [migrateThread] about to log metadata event: { _id: "yz-uc-core-a01-2014-03-04T22:40:13-161", server: "yz-uc-core-a01", clientAddr: ":27017", time: new Date(1393972813571), what: "moveChunk.to", ns: "archive.archive", details: { min: { uid: -2130706432 }, max: { uid: MaxKey }, step1 of 5: 9, step2 of 5: 0, step3 of 5: 0, step4 of 5: 0, note: "aborted" } }
Tue Mar 4 22:14:47 [migrateThread] warning: migrate commit waiting for 3 slaves for 'archive.archive' { uid: -2130706432 } -> { uid: MaxKey } waiting for: 5315c844:6
Wed Aug 21 01:31:35 [migrateThread] warning: migrate commit waiting for 2 slaves for 'post.post' { uid: 1107296256 } -> { uid: MaxKey } waiting for: 5213a7f7:4
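A note on reading these warnings: the value after "waiting for:" is an OpTime, which this MongoDB version prints as hexadecimal seconds followed by an increment counter, so 5315c844:6 decodes to 1393936452 seconds since the epoch, i.e. Tue Mar 4 2014 20:34:12 in this log's local time (UTC+8), the moment the moveChunk above was issued. A small stand-alone decoder (a sketch added here, not part of the original code):

#include <cstdio>
#include <cstdlib>
#include <ctime>

int main() {
    const char* optime = "5315c844:6";                 // as printed in the warning above
    unsigned long secs = strtoul(optime, NULL, 16);    // hex seconds; parsing stops at ':'
    time_t t = static_cast<time_t>(secs);
    char buf[64];
    strftime(buf, sizeof(buf), "%Y-%m-%d %H:%M:%S UTC", gmtime(&t));
    printf("%lu -> %s\n", secs, buf);                  // 1393936452 -> 2014-03-04 12:34:12 UTC
    return 0;
}

So at 22:14, and still at the abort at 06:40 the next morning, the commit is waiting for an op stamped at the very start of the migration to be acknowledged by 3 slaves.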
// d_migrate.cpp
class MigrateStatus {
    void _go() {
        // ......
        // pause to wait for replication
        Timer t;
        while ( t.minutes() < 600 ) {       // poll replication progress for up to 10 hours
            if ( flushPendingWrites( lastOpApplied ) )
                break;
            sleepsecs(1);
        }
    }

    bool flushPendingWrites( const ReplTime& lastOpApplied ) {
        if ( ! opReplicatedEnough( lastOpApplied ) ) {
            OpTime op( lastOpApplied );
            // this is where the "migrate commit waiting for N slaves" warning is logged
            OCCASIONALLY warning() << "migrate commit waiting for " << slaveCount
                                   << " slaves for '" << ns << "' " << min << " -> " << max
                                   << " waiting for: " << op << migrateLog;
            return false;
        }
        // ...
    }

    // The two functions below check replication progress. Note that slaveCount is passed
    // by value, not by reference: it was computed earlier, and opReplicatedEnough() will
    // not modify it.
    bool opReplicatedEnough( const ReplTime& lastOpApplied ) {
        return mongo::opReplicatedEnough( lastOpApplied , slaveCount );
    }
    bool opReplicatedEnough( OpTime op , BSONElement w ) {
        return slaveTracking.opReplicatedEnough( op , w );
    }
};
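A minimal stand-alone sketch of what this loop does in the failure case (a simplification, not MongoDB source; it simply assumes the replication check never succeeds again, for the reasons analysed below): the loop polls once per second and only gives up after 600 minutes, i.e. 36,000,000 ms, which lines up with the "step4 of 6: 36000896" and the 36361617ms figures in the aborted moveChunk entries above.

#include <iostream>

// Stand-in for opReplicatedEnough(): assume the required slave count is never
// reached again (why that can happen is analysed below, around reset()/update()).
static bool replicatedEnough() { return false; }

int main() {
    long long seconds = 0;                 // simulated clock instead of sleepsecs(1)
    while (seconds / 60 < 600) {           // same 10-hour bound as "t.minutes() < 600"
        if (replicatedEnough())
            break;                         // the migrate commit would proceed here
        ++seconds;                         // one one-second polling iteration
    }
    std::cout << "gave up after " << seconds << " s = " << seconds * 1000
              << " ms; the migration is then aborted" << std::endl;
    return 0;
}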
// repl_block.cpp
// Counts the tracked slaves whose last applied OpTime has reached op; returns true
// once at least numSlaves of them have.
bool _replicatedToNum_slaves_locked(OpTime& op, int numSlaves ) {
    for ( map<Ident,OpTime>::iterator i = _slaves.begin(); i != _slaves.end(); i++ ) {
        OpTime s = i->second;
        if ( s < op ) {
            continue;
        }
        if ( --numSlaves == 0 )
            return true;
    }
    return numSlaves <= 0;
}
// repl_block.cpp:111
void reset() {
    if ( _currentlyUpdatingCache )
        return;
    scoped_lock mylk(_mutex);
    _slaves.clear();        // clears the slave progress map
}

// repl_block.cpp:118
void update( const BSONObj& rid , const string& host , const string& ns , OpTime last ) {
    REPLDEBUG( host << " " << rid << " " << ns << " " << last );
    Ident ident(rid,host,ns);
    scoped_lock mylk(_mutex);
    _slaves[ident] = last;  // records this slave's progress
    _dirty = true;
    // .....
}
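Putting reset()/update() together with the counting logic above gives the scenario these code paths allow (a sketch with hypothetical node names, not MongoDB source): the migrate thread keeps requiring the slaveCount captured when the wait started, reset() empties _slaves (it is reached from the call sites instrumented in the next snippet, e.g. a replica set reconfig or a database close), and if one of the previously counted members never calls update() again, the check can never succeed no matter how long the loop waits.

#include <iostream>
#include <map>
#include <string>

typedef std::map<std::string, long long> SlaveMap;   // stands in for _slaves (id -> last applied optime)

// Same counting rule as _replicatedToNum_slaves_locked(): true once at least
// numSlaves tracked members have replicated to or past op.
static bool replicatedToNumSlaves(const SlaveMap& slaves, long long op, int numSlaves) {
    for (SlaveMap::const_iterator i = slaves.begin(); i != slaves.end(); ++i) {
        if (i->second < op)
            continue;
        if (--numSlaves == 0)
            return true;
    }
    return numSlaves <= 0;
}

int main() {
    const int slaveCount = 3;    // captured once, when the migrate wait started
    const long long op = 100;    // optime the commit is waiting on

    SlaveMap slaves;
    slaves["a"] = 100; slaves["b"] = 100; slaves["c"] = 100;
    std::cout << replicatedToNumSlaves(slaves, op, slaveCount) << "\n";  // 1: commit could proceed

    slaves.clear();                         // reset(), e.g. on a replica set reconfig
    slaves["a"] = 120; slaves["b"] = 120;   // only two members ever call update() again
    std::cout << replicatedToNumSlaves(slaves, op, slaveCount) << "\n";  // 0: can never reach 3
    return 0;
}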
src/mongo/db/repl_block.cpp:115:log(LL_ERROR) << "[yyy] clear _slave in " << " void reset() _slave.clear()\n";
src/mongo/db/repl_block.cpp:126:log(LL_ERROR) << "[yyy] ident:" << rid.toString(false, false) << " " << host << " " << ns << " | last:" << last;
src/mongo/db/repl/replset_commands.cpp:215:log(LL_ERROR) << "[yyy] clear _slave in " << " CmdReplSetReconfig";
src/mongo/db/repl/rs_config.cpp:72:log(LL_ERROR) << "[yyy] clear _slave in " << "saveConfigLocally" << rsLog;
src/mongo/db/oplog.cpp:50:log(LL_ERROR) << "[yyy] clear _slave in " << " close database";