Calculate corrected theta from resequencing data: Difference between revisions
No edit summary |
No edit summary |
||
(4 intermediate revisions by 2 users not shown) | |||
Line 1: | Line 1: | ||
This procedure will estimate theta (nucleotide diversity) based on re-sequencing data. The method is describe in [http://www.biomedcentral.com/1471-2164/14/148 Esteve-Codina et al.] | |||
<source lang ='bash'> | <source lang ='bash'> | ||
#!/bin/bash | #!/bin/bash | ||
Line 5: | Line 8: | ||
#SBATCH --ntasks=1 | #SBATCH --ntasks=1 | ||
#SBATCH --nodes=1 | #SBATCH --nodes=1 | ||
#SBATCH --constraint= | #SBATCH --constraint=4gpercpu | ||
#SBATCH --output=output_%j.txt | #SBATCH --output=output_%j.txt | ||
#SBATCH --error=error_output_%j.txt | #SBATCH --error=error_output_%j.txt | ||
#SBATCH --job-name=ngstheta | #SBATCH --job-name=ngstheta | ||
module load samtools/0.1.19 | module load samtools/0.1.19 | ||
VAR=`gunzip -c /lustre/nobackup/WUR/ABGC/shared/Pig/vars_hjm_newbuild10_2/vars-flt_$1-final.txt.gz | cut -f8 | head -1000000 | sort | uniq -c | sed 's/^ \+//' | sed 's/ \+/\t/' | sort -k1 -nr | head -1 | cut -f2` | VAR=`gunzip -c /lustre/nobackup/WUR/ABGC/shared/Pig/vars_hjm_newbuild10_2/vars-flt_$1-final.txt.gz | cut -f8 | head -1000000 | sort | uniq -c | sed 's/^ \+//' | sed 's/ \+/\t/' | sort -k1 -nr | head -1 | cut -f2` | ||
Line 24: | Line 26: | ||
</source> | </source> | ||
The script can be submitted using <code>sbatch</code> using the following code, assuming that the names of the individuals are listed in a file called <code>individuals.txt</code>. | |||
<source lang='bash'> | <source lang='bash'> | ||
INDS=`cat | INDS=`cat individuals.txt` | ||
for IND in $INDS; do sbatch nucdiv_pipeline.sh $IND; done | for IND in $INDS; do sbatch nucdiv_pipeline.sh $IND; done | ||
</source> | </source> | ||
Average values for Theta were then extracted with the following R-srcript: | |||
<source lang = 'rsplus'> | <source lang = 'rsplus'> | ||
files=list.files(pattern="wintheta") | files=list.files(pattern="wintheta") | ||
Line 34: | Line 38: | ||
for (file1 in files){ | for (file1 in files){ | ||
x <- read.table(file1,header=T); | x <- read.table(file1,header=T); | ||
mn=mean(x$THETA_HET[x$BP>20000 & x$CHR != 'chrUN_nr']); | mn=mean(x$THETA_HET[x$BP>20000 & x$CHR != 'chrUN_nr' & x$CHR != 'Ssc10_2_X']); | ||
print(paste(file1,mn,sep=" ")); | print(paste(file1,mn,sep=" ")); | ||
a<- rbind(a,data.frame("file"=file1,"theta_het"=mn)) | a<- rbind(a,data.frame("file"=file1,"theta_het"=mn)) |
Latest revision as of 16:00, 15 July 2019
This procedure will estimate theta (nucleotide diversity) based on re-sequencing data. The method is describe in Esteve-Codina et al.
<source lang ='bash'>
- !/bin/bash
- SBATCH --time=10000
- SBATCH --mem=4000
- SBATCH --ntasks=1
- SBATCH --nodes=1
- SBATCH --constraint=4gpercpu
- SBATCH --output=output_%j.txt
- SBATCH --error=error_output_%j.txt
- SBATCH --job-name=ngstheta
module load samtools/0.1.19 VAR=`gunzip -c /lustre/nobackup/WUR/ABGC/shared/Pig/vars_hjm_newbuild10_2/vars-flt_$1-final.txt.gz | cut -f8 | head -1000000 | sort | uniq -c | sed 's/^ \+//' | sed 's/ \+/\t/' | sort -k1 -nr | head -1 | cut -f2` let MAX=2*VAR echo "$1 max_depth is $MAX" MIN=$(( $VAR / 3 )) if [ $MIN -lt 5 ]; then MIN=4; fi
echo "$1 min_depth is $MIN" samtools mpileup -uf /lustre/nobackup/WUR/ABGC/shared/Pig/Sscrofa_build10_2/FASTA/Ssc10_2_chromosomes.fa /lustre/nobackup/WUR/ABGC/shared/Pig/BAM_files_hjm_newbuild10_2/$1_rh.bam | bcftools view -bvcg - > $1.mig.bcf bcftools view $1.mig.bcf | vcfutils.pl varFilter -d$MIN -D$MAX > $1.mig.vcf awk '$6 >= 20' $1.mig.vcf > $1.miguel.vcf samtools mpileup -Bq 20 -d 50000 /lustre/nobackup/WUR/ABGC/shared/Pig/BAM_files_hjm_newbuild10_2/$1_rh.bam | perl covXwin-v3.1.pl -v $1.miguel.vcf -w 50000 -d $MIN -m $MAX -b /lustre/nobackup/WUR/ABGC/shared/Pig/BAM_files_hjm_newbuild10_2/$1_rh.bam | ./ngs_theta -d $MIN -m $MAX > $1.wintheta </source>
The script can be submitted using sbatch
using the following code, assuming that the names of the individuals are listed in a file called individuals.txt
.
<source lang='bash'>
INDS=`cat individuals.txt`
for IND in $INDS; do sbatch nucdiv_pipeline.sh $IND; done
</source>
Average values for Theta were then extracted with the following R-srcript: <source lang = 'rsplus'> files=list.files(pattern="wintheta") a <- data.frame("file" = character(), "theta_het" = numeric()) for (file1 in files){
x <- read.table(file1,header=T); mn=mean(x$THETA_HET[x$BP>20000 & x$CHR != 'chrUN_nr' & x$CHR != 'Ssc10_2_X']); print(paste(file1,mn,sep=" ")); a<- rbind(a,data.frame("file"=file1,"theta_het"=mn))
} write.table(x=a,file="theta_het_results.txt") </source>