Thursday, October 29, 2009

PSI-BLAST and BLAST background probabilities

This post is not directly related to MATLAB but I felt it was important to post this.

I recently realized that it is not trivial to find the background amino acid probabilities that are used in BLAST and PSI-BLAST. Google didn't find it. None of the papers referenced in the BLAST papers actually have the frequencies in a tabular form. I would have thought this should have been documented by NCBI in BLAST help or something! Anyway after a few hours of searching and reading papers and eventually code, I found the actual values used. They can be found in this file

http://www.ncbi.nlm.nih.gov/IEB/ToolBox/C_DOC/lxr/source/tools/blastkar.c

Below are the tables which contain the frequencies. They need to be normalized (divide by the sum of the frequencies = 1000) to convert the frequencies to probabilities.

Google doc spreadsheet

NOTE: PSI-BLAST uses the Robinson values by default

2345 #if STD_AMINO_ACID_FREQS == Dayhoff_prob
2346 /* M. O. Dayhoff amino acid background frequencies */
2347 static BLAST_LetterProb Dayhoff_prob[] = {
2348 { 'A', 87.13 },
2349 { 'C', 33.47 },
2350 { 'D', 46.87 },
2351 { 'E', 49.53 },
2352 { 'F', 39.77 },
2353 { 'G', 88.61 },
2354 { 'H', 33.62 },
2355 { 'I', 36.89 },
2356 { 'K', 80.48 },
2357 { 'L', 85.36 },
2358 { 'M', 14.75 },
2359 { 'N', 40.43 },
2360 { 'P', 50.68 },
2361 { 'Q', 38.26 },
2362 { 'R', 40.90 },
2363 { 'S', 69.58 },
2364 { 'T', 58.54 },
2365 { 'V', 64.72 },
2366 { 'W', 10.49 },
2367 { 'Y', 29.92 }
2368 };
2369 #endif
2370
2371 #if STD_AMINO_ACID_FREQS == Altschul_prob
2372 /* Stephen Altschul amino acid background frequencies */
2373 static BLAST_LetterProb Altschul_prob[] = {
2374 { 'A', 81.00 },
2375 { 'C', 15.00 },
2376 { 'D', 54.00 },
2377 { 'E', 61.00 },
2378 { 'F', 40.00 },
2379 { 'G', 68.00 },
2380 { 'H', 22.00 },
2381 { 'I', 57.00 },
2382 { 'K', 56.00 },
2383 { 'L', 93.00 },
2384 { 'M', 25.00 },
2385 { 'N', 45.00 },
2386 { 'P', 49.00 },
2387 { 'Q', 39.00 },
2388 { 'R', 57.00 },
2389 { 'S', 68.00 },
2390 { 'T', 58.00 },
2391 { 'V', 67.00 },
2392 { 'W', 13.00 },
2393 { 'Y', 32.00 }
2394 };
2395 #endif
2396
2397 #if STD_AMINO_ACID_FREQS == Robinson_prob
2398 /* amino acid background frequencies from Robinson and Robinson */
2399 static BLAST_LetterProb Robinson_prob[] = {
2400 { 'A', 78.05 },
2401 { 'C', 19.25 },
2402 { 'D', 53.64 },
2403 { 'E', 62.95 },
2404 { 'F', 38.56 },
2405 { 'G', 73.77 },
2406 { 'H', 21.99 },
2407 { 'I', 51.42 },
2408 { 'K', 57.44 },
2409 { 'L', 90.19 },
2410 { 'M', 22.43 },
2411 { 'N', 44.87 },
2412 { 'P', 52.03 },
2413 { 'Q', 42.64 },
2414 { 'R', 51.29 },
2415 { 'S', 71.20 },
2416 { 'T', 58.41 },
2417 { 'V', 64.41 },
2418 { 'W', 13.30 },
2419 { 'Y', 32.16 }
2420 };
2421 #endif
2422
2423 static BLAST_LetterProb nt_prob[] = {
2424 { 'A', 25.00 },
2425 { 'C', 25.00 },
2426 { 'G', 25.00 },
2427 { 'T', 25.00 }
2428 };

No comments: