mkdir -p data/fastq
mkdir -p data/bam
mkdir -p data/bw
mkdir -p data/peaks

mkdir -p fastqc
mkdir -p src
mkdir -p logs
mkdir -p figures

export HG38="/mnt/storage/r0978323/bowtie_hg38/hg38"
ls -lh $HG38*

export HG38_FASTA="/mnt/storage/r0978323/GCF_000001405.40_GRCh38.p14_genomic.fna"
head -2 "$HG38_FASTA"

export PATH="/mnt/storage/r0978323/micromamba/envs/project-2/bin:$PATH"
which cutadapt
which bamCoverage
which macs2

export PATH="/mnt/storage/r0978323/FastQC_v0.12.1:$PATH"
fastqc --version

export PATH="/mnt/storage/r0978323/TrimGalore_v0.6.10:$PATH"
trim_galore --version

export PATH="/mnt/storage/r0978323/bin:$PATH"
faCount

-rw-r--r-- 1 r0978323 domain users 1002M Dec 29 13:32 /mnt/storage/r0978323/bowtie_hg38/hg38.1.bt2
-rw-r--r-- 1 r0978323 domain users  748M Dec 29 13:35 /mnt/storage/r0978323/bowtie_hg38/hg38.2.bt2
-rw-r--r-- 1 r0978323 domain users   18K Dec 29 13:35 /mnt/storage/r0978323/bowtie_hg38/hg38.3.bt2
-rw-r--r-- 1 r0978323 domain users  748M Dec 29 13:27 /mnt/storage/r0978323/bowtie_hg38/hg38.4.bt2
-rw-r--r-- 1 r0978323 domain users 1002M Dec 29 13:42 /mnt/storage/r0978323/bowtie_hg38/hg38.rev.1.bt2
-rw-r--r-- 1 r0978323 domain users  748M Dec 29 13:38 /mnt/storage/r0978323/bowtie_hg38/hg38.rev.2.bt2
>NC_000001.11 Homo sapiens chromosome 1, GRCh38.p14 Primary Assembly
NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN
/mnt/storage/r0978323/micromamba/envs/project-2/bin/cutadapt
/mnt/storage/r0978323/micromamba/envs/project-2/bin/bamCoverage
/mnt/storage/r0978323/micromamba/envs/project-2/bin/macs2
FastQC v0.12.1

                        Quality-/Adapter-/RRBS-/Speciality-Trimming
                                [powered by Cutadapt]
                                  version 0.6.10

                               Last update: 02 02 2023

faCount - count base statistics and CpGs in FA files.
usage:
   faCount file(s).fa
     -summary  show only summary statistics
     -dinuc    include statistics on dinucletoide frequencies
     -strands  count bases on both strands


export accession="SRR27667476"

echo "> Downloading fastq files..."
time fastq-dump -v --split-3 --outdir data/fastq/ "$accession"

echo "> Reads in downloaded fastq files:"
echo $(( $(wc -l "data/fastq/${accession}_1.fastq" | cut -d' ' -f1) / 4 ))
echo $(( $(wc -l "data/fastq/${accession}_2.fastq" | cut -d' ' -f1) / 4 ))

> Downloading fastq files...
Preference setting is: Prefer SRA Normalized Format files with full base quality scores if available.
SRR27667476 is an SRA Normalized Format file with full base quality scores.
Read 21073282 spots for SRR27667476
Written 21073282 spots for SRR27667476

real	14m22.931s
user	4m28.895s
sys	0m23.437s
> Reads in downloaded fastq files:
21073282
21073282


# FastQC pre-trim
mkdir -p "fastqc/${accession}_1"
time fastqc "data/fastq/${accession}_1.fastq" -o "fastqc/${accession}_1"
mkdir -p "fastqc/${accession}_2"
time fastqc "data/fastq/${accession}_2.fastq" -o "fastqc/${accession}_2"

null
Started analysis of SRR27667476_1.fastq
Approx 5% complete for SRR27667476_1.fastq
Approx 10% complete for SRR27667476_1.fastq
Approx 15% complete for SRR27667476_1.fastq
Approx 20% complete for SRR27667476_1.fastq
Approx 25% complete for SRR27667476_1.fastq
Approx 30% complete for SRR27667476_1.fastq
Approx 35% complete for SRR27667476_1.fastq
Approx 40% complete for SRR27667476_1.fastq
Approx 45% complete for SRR27667476_1.fastq
Approx 50% complete for SRR27667476_1.fastq
Approx 55% complete for SRR27667476_1.fastq
Approx 60% complete for SRR27667476_1.fastq
Approx 65% complete for SRR27667476_1.fastq
Approx 70% complete for SRR27667476_1.fastq
Approx 75% complete for SRR27667476_1.fastq
Approx 80% complete for SRR27667476_1.fastq
Approx 85% complete for SRR27667476_1.fastq
Approx 90% complete for SRR27667476_1.fastq
Approx 95% complete for SRR27667476_1.fastq
Analysis complete for SRR27667476_1.fastq

real	2m15.447s
user	2m18.338s
sys	0m4.499s
null
Started analysis of SRR27667476_2.fastq
Approx 5% complete for SRR27667476_2.fastq
Approx 10% complete for SRR27667476_2.fastq
Approx 15% complete for SRR27667476_2.fastq
Approx 20% complete for SRR27667476_2.fastq
Approx 25% complete for SRR27667476_2.fastq
Approx 30% complete for SRR27667476_2.fastq
Approx 35% complete for SRR27667476_2.fastq
Approx 40% complete for SRR27667476_2.fastq
Approx 45% complete for SRR27667476_2.fastq
Approx 50% complete for SRR27667476_2.fastq
Approx 55% complete for SRR27667476_2.fastq
Approx 60% complete for SRR27667476_2.fastq
Approx 65% complete for SRR27667476_2.fastq
Approx 70% complete for SRR27667476_2.fastq
Approx 75% complete for SRR27667476_2.fastq
Approx 80% complete for SRR27667476_2.fastq
Approx 85% complete for SRR27667476_2.fastq
Approx 90% complete for SRR27667476_2.fastq
Approx 95% complete for SRR27667476_2.fastq
Analysis complete for SRR27667476_2.fastq

real	2m10.437s
user	2m14.854s
sys	0m4.254s


set -e

# Use trim_galore to produce a pair of trimmed read files
cd data/fastq
trim_galore --paired "${accession}_1.fastq" "${accession}_2.fastq"
# Creates "${accession}_{1/2}_val_{1/2}.fq" inside the data/fastq directory
cd ../..

# We'll only keep the trimmed fastq files:
rm "data/fastq/${accession}_1.fastq"
rm "data/fastq/${accession}_2.fastq"

# Let's pick a clearer name and a consistent extension:
mv "data/fastq/${accession}_1_val_1.fq" "data/fastq/${accession}_trimmed_1.fastq"
mv "data/fastq/${accession}_2_val_2.fq" "data/fastq/${accession}_trimmed_2.fastq"

# FastQC post-trim:
mkdir -p "fastqc/${accession}_trimmed_1"
time fastqc "data/fastq/${accession}_trimmed_1.fastq" -o "fastqc/${accession}_trimmed_1"
mkdir -p "fastqc/${accession}_trimmed_2"
time fastqc "data/fastq/${accession}_trimmed_2.fastq" -o "fastqc/${accession}_trimmed_2"

Multicore support not enabled. Proceeding with single-core trimming.
Path to Cutadapt set as: 'cutadapt' (default)
Cutadapt seems to be working fine (tested command 'cutadapt --version')
Cutadapt version: 5.0
single-core operation.
Proceeding with 'gzip' for decompression
To decrease CPU usage of decompression, please install 'igzip' and run again

No quality encoding type selected. Assuming that the data provided uses Sanger encoded Phred scores (default)


AUTO-DETECTING ADAPTER TYPE
===========================
Attempting to auto-detect adapter type from the first 1 million sequences of the first file (>> SRR27667476_1.fastq <<)

Found perfect matches for the following adapter sequences:
Adapter type	Count	Sequence	Sequences analysed	Percentage
Illumina	4622	AGATCGGAAGAGC	1000000	0.46
Nextera	16	CTGTCTCTTATA	1000000	0.00
smallRNA	5	TGGAATTCTCGG	1000000	0.00
Using Illumina adapter for trimming (count: 4622). Second best hit was Nextera (count: 16)

Writing report to 'SRR27667476_1.fastq_trimming_report.txt'

SUMMARISING RUN PARAMETERS
==========================
Input filename: SRR27667476_1.fastq
Trimming mode: paired-end
Trim Galore version: 0.6.10
Cutadapt version: 5.0
Number of cores used for trimming: 1
Quality Phred score cutoff: 20
Quality encoding type selected: ASCII+33
Adapter sequence: 'AGATCGGAAGAGC' (Illumina TruSeq, Sanger iPCR; auto-detected)
Maximum trimming error rate: 0.1 (default)
Minimum required adapter overlap (stringency): 1 bp
Minimum required sequence length for both reads before a sequence pair gets removed: 20 bp

Cutadapt seems to be fairly up-to-date (version 5.0). Setting -j 1
Writing final adapter and quality trimmed output to SRR27667476_1_trimmed.fq


  >>> Now performing quality (cutoff '-q 20') and adapter trimming in a single pass for the adapter sequence: 'AGATCGGAAGAGC' from file SRR27667476_1.fastq <<< 
10000000 sequences processed
20000000 sequences processed
This is cutadapt 5.0 with Python 3.11.11
Command line parameters: -j 1 -e 0.1 -q 20 -O 1 -a AGATCGGAAGAGC SRR27667476_1.fastq
Processing single-end reads on 1 core ...

=== Summary ===

Total reads processed:              21,073,282
Reads with adapters:                 7,447,638 (35.3%)
Reads written (passing filters):    21,073,282 (100.0%)

Total basepairs processed: 3,160,992,300 bp
Quality-trimmed:               4,689,157 bp (0.1%)
Total written (filtered):  3,124,823,787 bp (98.9%)

=== Adapter 1 ===

Sequence: AGATCGGAAGAGC; Type: regular 3'; Length: 13; Trimmed: 7447638 times

Minimum overlap: 1
No. of allowed errors:
1-9 bp: 0; 10-13 bp: 1

Bases preceding removed adapters:
  A: 30.4%
  C: 29.6%
  G: 19.6%
  T: 18.7%
  none/other: 1.6%

Overview of removed sequences
length	count	expect	max.err	error counts
1	5114588	5268320.5	0	5114588
2	1460473	1317080.1	0	1460473
3	447163	329270.0	0	447163
4	102185	82317.5	0	102185
5	31400	20579.4	0	31400
6	14745	5144.8	0	14745
7	11798	1286.2	0	11798
8	10847	321.6	0	10847
9	9618	80.4	0	9338 280
10	8856	20.1	1	8357 499
11	8129	5.0	1	7700 429
12	7433	1.3	1	7220 213
13	6563	0.3	1	6411 152
14	6179	0.3	1	6006 173
15	5549	0.3	1	5416 133
16	5246	0.3	1	5066 180
17	5125	0.3	1	4956 169
18	4383	0.3	1	4291 92
19	3881	0.3	1	3784 97
20	3826	0.3	1	3695 131
21	3494	0.3	1	3404 90
22	3101	0.3	1	3024 77
23	3040	0.3	1	2942 98
24	2962	0.3	1	2866 96
25	2791	0.3	1	2706 85
26	2536	0.3	1	2472 64
27	2536	0.3	1	2462 74
28	2400	0.3	1	2323 77
29	2268	0.3	1	2216 52
30	2086	0.3	1	2032 54
31	1857	0.3	1	1808 49
32	1819	0.3	1	1755 64
33	1808	0.3	1	1748 60
34	1687	0.3	1	1639 48
35	1529	0.3	1	1479 50
36	1471	0.3	1	1427 44
37	1525	0.3	1	1444 81
38	1286	0.3	1	1253 33
39	1357	0.3	1	1314 43
40	1145	0.3	1	1076 69
41	1161	0.3	1	1111 50
42	1010	0.3	1	970 40
43	1005	0.3	1	962 43
44	937	0.3	1	900 37
45	798	0.3	1	773 25
46	776	0.3	1	733 43
47	1219	0.3	1	1179 40
48	229	0.3	1	200 29
49	551	0.3	1	513 38
50	671	0.3	1	630 41
51	447	0.3	1	429 18
52	416	0.3	1	396 20
53	477	0.3	1	445 32
54	348	0.3	1	327 21
55	342	0.3	1	329 13
56	354	0.3	1	317 37
57	337	0.3	1	320 17
58	276	0.3	1	254 22
59	304	0.3	1	290 14
60	198	0.3	1	186 12
61	224	0.3	1	200 24
62	148	0.3	1	135 13
63	119	0.3	1	97 22
64	193	0.3	1	177 16
65	172	0.3	1	147 25
66	99	0.3	1	89 10
67	109	0.3	1	87 22
68	209	0.3	1	184 25
69	92	0.3	1	84 8
70	126	0.3	1	112 14
71	72	0.3	1	60 12
72	40	0.3	1	27 13
73	32	0.3	1	15 17
74	35	0.3	1	24 11
75	71	0.3	1	47 24
76	63	0.3	1	45 18
77	62	0.3	1	53 9
78	56	0.3	1	46 10
79	79	0.3	1	42 37
80	60	0.3	1	43 17
81	71	0.3	1	50 21
82	54	0.3	1	30 24
83	51	0.3	1	37 14
84	64	0.3	1	41 23
85	62	0.3	1	36 26
86	48	0.3	1	31 17
87	51	0.3	1	41 10
88	54	0.3	1	39 15
89	52	0.3	1	31 21
90	49	0.3	1	35 14
91	46	0.3	1	30 16
92	45	0.3	1	21 24
93	53	0.3	1	28 25
94	39	0.3	1	25 14
95	25	0.3	1	19 6
96	47	0.3	1	28 19
97	42	0.3	1	26 16
98	54	0.3	1	28 26
99	45	0.3	1	23 22
100	37	0.3	1	21 16
101	44	0.3	1	31 13
102	40	0.3	1	19 21
103	48	0.3	1	27 21
104	34	0.3	1	23 11
105	26	0.3	1	19 7
106	41	0.3	1	20 21
107	29	0.3	1	14 15
108	41	0.3	1	15 26
109	37	0.3	1	17 20
110	28	0.3	1	16 12
111	35	0.3	1	17 18
112	47	0.3	1	20 27
113	49	0.3	1	17 32
114	30	0.3	1	21 9
115	48	0.3	1	18 30
116	26	0.3	1	18 8
117	42	0.3	1	22 20
118	33	0.3	1	19 14
119	120	0.3	1	112 8
120	251	0.3	1	228 23
121	54	0.3	1	38 16
122	102	0.3	1	88 14
123	82	0.3	1	63 19
124	52	0.3	1	38 14
125	37	0.3	1	29 8
126	28	0.3	1	12 16
127	22	0.3	1	12 10
128	30	0.3	1	17 13
129	24	0.3	1	9 15
130	18	0.3	1	11 7
131	39	0.3	1	14 25
132	33	0.3	1	29 4
133	24	0.3	1	10 14
134	19	0.3	1	6 13
135	69	0.3	1	51 18
136	16	0.3	1	6 10
137	78	0.3	1	64 14
138	45	0.3	1	33 12
139	98	0.3	1	87 11
140	40	0.3	1	28 12
141	64	0.3	1	52 12
142	37	0.3	1	25 12
143	64	0.3	1	47 17
144	18	0.3	1	10 8
145	13	0.3	1	2 11
146	72	0.3	1	53 19
147	53	0.3	1	17 36
148	35	0.3	1	9 26
149	299	0.3	1	32 267
150	119433	0.3	1	2050 117383

RUN STATISTICS FOR INPUT FILE: SRR27667476_1.fastq
=============================================
21073282 sequences processed in total
The length threshold of paired-end sequences gets evaluated later on (in the validation step)

Writing report to 'SRR27667476_2.fastq_trimming_report.txt'

SUMMARISING RUN PARAMETERS
==========================
Input filename: SRR27667476_2.fastq
Trimming mode: paired-end
Trim Galore version: 0.6.10
Cutadapt version: 5.0
Number of cores used for trimming: 1
Quality Phred score cutoff: 20
Quality encoding type selected: ASCII+33
Adapter sequence: 'AGATCGGAAGAGC' (Illumina TruSeq, Sanger iPCR; auto-detected)
Maximum trimming error rate: 0.1 (default)
Minimum required adapter overlap (stringency): 1 bp
Minimum required sequence length for both reads before a sequence pair gets removed: 20 bp

Cutadapt seems to be fairly up-to-date (version 5.0). Setting -j -j 1
Writing final adapter and quality trimmed output to SRR27667476_2_trimmed.fq


  >>> Now performing quality (cutoff '-q 20') and adapter trimming in a single pass for the adapter sequence: 'AGATCGGAAGAGC' from file SRR27667476_2.fastq <<< 
10000000 sequences processed
20000000 sequences processed
This is cutadapt 5.0 with Python 3.11.11
Command line parameters: -j 1 -e 0.1 -q 20 -O 1 -a AGATCGGAAGAGC SRR27667476_2.fastq
Processing single-end reads on 1 core ...

=== Summary ===

Total reads processed:              21,073,282
Reads with adapters:                 7,327,702 (34.8%)
Reads written (passing filters):    21,073,282 (100.0%)

Total basepairs processed: 3,160,992,300 bp
Quality-trimmed:               5,059,135 bp (0.2%)
Total written (filtered):  3,141,916,015 bp (99.4%)

=== Adapter 1 ===

Sequence: AGATCGGAAGAGC; Type: regular 3'; Length: 13; Trimmed: 7327702 times

Minimum overlap: 1
No. of allowed errors:
1-9 bp: 0; 10-13 bp: 1

Bases preceding removed adapters:
  A: 30.7%
  C: 30.0%
  G: 20.3%
  T: 18.9%
  none/other: 0.0%

Overview of removed sequences
length	count	expect	max.err	error counts
1	5094679	5268320.5	0	5094679
2	1470917	1317080.1	0	1470917
3	452319	329270.0	0	452319
4	101460	82317.5	0	101460
5	31256	20579.4	0	31256
6	14989	5144.8	0	14989
7	11975	1286.2	0	11975
8	11188	321.6	0	11188
9	9228	80.4	0	8929 299
10	9080	20.1	1	8522 558
11	8051	5.0	1	7667 384
12	7581	1.3	1	7349 232
13	6589	0.3	1	6416 173
14	6509	0.3	1	6324 185
15	5341	0.3	1	5219 122
16	5178	0.3	1	5055 123
17	5945	0.3	1	5790 155
18	3418	0.3	1	3331 87
19	5105	0.3	1	4953 152
20	3184	0.3	1	3110 74
21	3162	0.3	1	3091 71
22	3131	0.3	1	3044 87
23	3033	0.3	1	2950 83
24	3251	0.3	1	3128 123
25	3053	0.3	1	2992 61
26	2297	0.3	1	2239 58
27	2349	0.3	1	2266 83
28	2405	0.3	1	2358 47
29	2257	0.3	1	2191 66
30	2138	0.3	1	2077 61
31	1864	0.3	1	1812 52
32	1809	0.3	1	1766 43
33	1867	0.3	1	1810 57
34	1679	0.3	1	1623 56
35	1613	0.3	1	1558 55
36	1447	0.3	1	1400 47
37	1432	0.3	1	1388 44
38	1392	0.3	1	1345 47
39	1378	0.3	1	1331 47
40	1151	0.3	1	1110 41
41	1053	0.3	1	1019 34
42	1065	0.3	1	1032 33
43	1009	0.3	1	971 38
44	1004	0.3	1	951 53
45	719	0.3	1	699 20
46	784	0.3	1	742 42
47	760	0.3	1	726 34
48	724	0.3	1	691 33
49	605	0.3	1	567 38
50	602	0.3	1	566 36
51	488	0.3	1	468 20
52	480	0.3	1	446 34
53	419	0.3	1	396 23
54	393	0.3	1	362 31
55	405	0.3	1	368 37
56	379	0.3	1	344 35
57	272	0.3	1	249 23
58	319	0.3	1	293 26
59	376	0.3	1	335 41
60	189	0.3	1	161 28
61	212	0.3	1	184 28
62	229	0.3	1	194 35
63	196	0.3	1	163 33
64	195	0.3	1	168 27
65	259	0.3	1	214 45
66	109	0.3	1	84 25
67	121	0.3	1	108 13
68	108	0.3	1	87 21
69	135	0.3	1	114 21
70	124	0.3	1	97 27
71	122	0.3	1	83 39
72	108	0.3	1	86 22
73	107	0.3	1	87 20
74	98	0.3	1	78 20
75	168	0.3	1	136 32
76	112	0.3	1	82 30
77	59	0.3	1	45 14
78	35	0.3	1	17 18
79	69	0.3	1	43 26
80	76	0.3	1	49 27
81	73	0.3	1	53 20
82	80	0.3	1	51 29
83	71	0.3	1	50 21
84	74	0.3	1	55 19
85	73	0.3	1	44 29
86	78	0.3	1	49 29
87	76	0.3	1	52 24
88	57	0.3	1	44 13
89	75	0.3	1	46 29
90	60	0.3	1	42 18
91	75	0.3	1	52 23
92	68	0.3	1	37 31
93	65	0.3	1	47 18
94	62	0.3	1	35 27
95	48	0.3	1	36 12
96	69	0.3	1	45 24
97	59	0.3	1	37 22
98	66	0.3	1	44 22
99	50	0.3	1	34 16
100	50	0.3	1	32 18
101	68	0.3	1	51 17
102	47	0.3	1	25 22
103	65	0.3	1	36 29
104	54	0.3	1	34 20
105	40	0.3	1	27 13
106	52	0.3	1	23 29
107	51	0.3	1	22 29
108	59	0.3	1	25 34
109	35	0.3	1	25 10
110	48	0.3	1	32 16
111	39	0.3	1	22 17
112	41	0.3	1	30 11
113	64	0.3	1	30 34
114	49	0.3	1	34 15
115	50	0.3	1	30 20
116	46	0.3	1	28 18
117	32	0.3	1	15 17
118	40	0.3	1	27 13
119	281	0.3	1	257 24
120	243	0.3	1	221 22
121	60	0.3	1	37 23
122	108	0.3	1	85 23
123	88	0.3	1	73 15
124	62	0.3	1	36 26
125	27	0.3	1	18 9
126	32	0.3	1	16 16
127	23	0.3	1	17 6
128	25	0.3	1	17 8
129	24	0.3	1	9 15
130	31	0.3	1	14 17
131	32	0.3	1	15 17
132	46	0.3	1	26 20
133	24	0.3	1	12 12
134	16	0.3	1	6 10
135	60	0.3	1	51 9
136	18	0.3	1	5 13
137	77	0.3	1	65 12
138	57	0.3	1	42 15
139	100	0.3	1	88 12
140	40	0.3	1	27 13
141	64	0.3	1	53 11
142	31	0.3	1	24 7
143	64	0.3	1	48 16
144	11	0.3	1	10 1
145	10	0.3	1	2 8
146	67	0.3	1	53 14
147	32	0.3	1	17 15
148	20	0.3	1	9 11
149	51	0.3	1	30 21
150	2282	0.3	1	2077 205

RUN STATISTICS FOR INPUT FILE: SRR27667476_2.fastq
=============================================
21073282 sequences processed in total
The length threshold of paired-end sequences gets evaluated later on (in the validation step)

Validate paired-end files SRR27667476_1_trimmed.fq and SRR27667476_2_trimmed.fq
file_1: SRR27667476_1_trimmed.fq, file_2: SRR27667476_2_trimmed.fq


>>>>> Now validing the length of the 2 paired-end infiles: SRR27667476_1_trimmed.fq and SRR27667476_2_trimmed.fq <<<<<
Writing validated paired-end Read 1 reads to SRR27667476_1_val_1.fq
Writing validated paired-end Read 2 reads to SRR27667476_2_val_2.fq

Total number of sequences analysed: 21073282

Number of sequence pairs removed because at least one read was shorter than the length cutoff (20 bp): 123585 (0.59%)

Deleting both intermediate output files SRR27667476_1_trimmed.fq and SRR27667476_2_trimmed.fq

====================================================================================================

null
Started analysis of SRR27667476_trimmed_1.fastq
Approx 5% complete for SRR27667476_trimmed_1.fastq
Approx 10% complete for SRR27667476_trimmed_1.fastq
Approx 15% complete for SRR27667476_trimmed_1.fastq
Approx 20% complete for SRR27667476_trimmed_1.fastq
Approx 25% complete for SRR27667476_trimmed_1.fastq
Approx 30% complete for SRR27667476_trimmed_1.fastq
Approx 35% complete for SRR27667476_trimmed_1.fastq
Approx 40% complete for SRR27667476_trimmed_1.fastq
Approx 45% complete for SRR27667476_trimmed_1.fastq
Approx 50% complete for SRR27667476_trimmed_1.fastq
Approx 55% complete for SRR27667476_trimmed_1.fastq
Approx 60% complete for SRR27667476_trimmed_1.fastq
Approx 65% complete for SRR27667476_trimmed_1.fastq
Approx 70% complete for SRR27667476_trimmed_1.fastq
Approx 75% complete for SRR27667476_trimmed_1.fastq
Approx 80% complete for SRR27667476_trimmed_1.fastq
Approx 85% complete for SRR27667476_trimmed_1.fastq
Approx 90% complete for SRR27667476_trimmed_1.fastq
Approx 95% complete for SRR27667476_trimmed_1.fastq
Analysis complete for SRR27667476_trimmed_1.fastq

real	2m6.431s
user	2m11.074s
sys	0m3.072s
null
Started analysis of SRR27667476_trimmed_2.fastq
Approx 5% complete for SRR27667476_trimmed_2.fastq
Approx 10% complete for SRR27667476_trimmed_2.fastq
Approx 15% complete for SRR27667476_trimmed_2.fastq
Approx 20% complete for SRR27667476_trimmed_2.fastq
Approx 25% complete for SRR27667476_trimmed_2.fastq
Approx 30% complete for SRR27667476_trimmed_2.fastq
Approx 35% complete for SRR27667476_trimmed_2.fastq
Approx 40% complete for SRR27667476_trimmed_2.fastq
Approx 45% complete for SRR27667476_trimmed_2.fastq
Approx 50% complete for SRR27667476_trimmed_2.fastq
Approx 55% complete for SRR27667476_trimmed_2.fastq
Approx 60% complete for SRR27667476_trimmed_2.fastq
Approx 65% complete for SRR27667476_trimmed_2.fastq
Approx 70% complete for SRR27667476_trimmed_2.fastq
Approx 75% complete for SRR27667476_trimmed_2.fastq
Approx 80% complete for SRR27667476_trimmed_2.fastq
Approx 85% complete for SRR27667476_trimmed_2.fastq
Approx 90% complete for SRR27667476_trimmed_2.fastq
Approx 95% complete for SRR27667476_trimmed_2.fastq
Analysis complete for SRR27667476_trimmed_2.fastq

real	2m7.406s
user	2m11.350s
sys	0m3.202s


set -e

accession="SRR27667476"

# We can increase threads to speed up alignment, 
# but we'll use the default 1 to avoid high CPU usage
threads=1

# Align to hg38
echo "> Alignment: Working on ${accession}..."
time bowtie2 \
  -x "$HG38" \
  -p "$threads" \
  -1 "data/fastq/${accession}_trimmed_1.fastq" \
  -2 "data/fastq/${accession}_trimmed_2.fastq" \
  -S "data/bam/${accession}.sam" \
  | tee "logs/${accession}_bowtie.log"

> Alignment: Working on SRR27667476...
20949697 reads; of these:
  20949697 (100.00%) were paired; of these:
    1600333 (7.64%) aligned concordantly 0 times
    15746922 (75.17%) aligned concordantly exactly 1 time
    3602442 (17.20%) aligned concordantly >1 times
    ----
    1600333 pairs aligned concordantly 0 times; of these:
      29836 (1.86%) aligned discordantly 1 time
    ----
    1570497 pairs aligned 0 times concordantly or discordantly; of these:
      3140994 mates make up the pairs; of these:
        2889548 (91.99%) aligned 0 times
        143959 (4.58%) aligned exactly 1 time
        107487 (3.42%) aligned >1 times
93.10% overall alignment rate

real	233m3.746s
user	228m24.300s
sys	4m37.758s


# Convert to a binary file and remove the text one afterwards:
samtools view -S -b "data/bam/${accession}.sam" > "data/bam/${accession}.bam"
rm "data/bam/${accession}.sam"

# Calculate some statistics and save them in a log for later:
echo "> Flagstats for ${accession}.bam:"
samtools flagstat "data/bam/${accession}.bam" | tee "logs/${accession}_flagstats.log"

> Flagstats for SRR27667476.bam:
41899394 + 0 in total (QC-passed reads + QC-failed reads)
0 + 0 secondary
0 + 0 supplementary
0 + 0 duplicates
39009846 + 0 mapped (93.10% : N/A)
41899394 + 0 paired in sequencing
20949697 + 0 read1
20949697 + 0 read2
38698728 + 0 properly paired (92.36% : N/A)
38800354 + 0 with itself and mate mapped
209492 + 0 singletons (0.50% : N/A)
22584 + 0 with mate mapped to a different chr
8734 + 0 with mate mapped to a different chr (mapQ>=5)


echo "> Clean unmapped/secondary reads for ${accession}.bam:"
samtools view -F 'UNMAP,SECONDARY' -O bam -o "data/bam/${accession}.clean.bam" "data/bam/${accession}.bam"

echo "> Flagstats for ${accession}.clean.bam:"
samtools flagstat "data/bam/${accession}.clean.bam" | tee "logs/${accession}_flagstats_clean.log"

> Clean unmapped/secondary reads for SRR27667476.bam:
> Flagstats for SRR27667476.clean.bam:
39009846 + 0 in total (QC-passed reads + QC-failed reads)
0 + 0 secondary
0 + 0 supplementary
0 + 0 duplicates
39009846 + 0 mapped (100.00% : N/A)
39009846 + 0 paired in sequencing
19520576 + 0 read1
19489270 + 0 read2
38698728 + 0 properly paired (99.20% : N/A)
38800354 + 0 with itself and mate mapped
209492 + 0 singletons (0.54% : N/A)
22584 + 0 with mate mapped to a different chr
8734 + 0 with mate mapped to a different chr (mapQ>=5)


echo "> Sort and index ${accession}.clean.bam:"
samtools sort -O bam -o "data/bam/${accession}.sorted.bam" "data/bam/${accession}.clean.bam"
samtools index "data/bam/${accession}.sorted.bam"

# Remove bam files other than the final sorted one
rm "data/bam/${accession}.bam"
rm "data/bam/${accession}.clean.bam"

> Sort and index SRR27667476.clean.bam:
[bam_sort_core] merging from 17 files and 1 in-memory blocks...


accession="SRR27667476"

rm "data/fastq/${accession}_trimmed_1.fastq"
rm "data/fastq/${accession}_trimmed_2.fastq"


accessions=(
  SRR27667476
  SRR27667481
)

# Note: the cell output (and its `time` output) is from threads=4
threads=1

for accession in "${accessions[@]}"; do
  # Download accession
  if [ -f "data/fastq/${accession}_trimmed_1.fastq" ] || [ -f "data/bam/${accession}.sorted.bam" ]; then
    echo "> Download: Skipping $accession, already downloaded or processed to a BAM file"
  else
    time fastq-dump -v --split-3 --outdir data/fastq/ "$accession"

    echo "> Reads in downloaded fastq files:"
    echo $(( $(wc -l "data/fastq/${accession}_1.fastq" | cut -d' ' -f1) / 4 ))
    echo $(( $(wc -l "data/fastq/${accession}_2.fastq" | cut -d' ' -f1) / 4 ))

    # Trim adaptors:
    cd data/fastq
    trim_galore --paired "${accession}_1.fastq" "${accession}_2.fastq"
    # Outputs "${accession}_trimmed.fq"
    cd ../..

    # Only keep the trimmed fastq files:
    rm "data/fastq/${accession}_1.fastq"
    rm "data/fastq/${accession}_2.fastq"
    mv "data/fastq/${accession}_1_val_1.fq" "data/fastq/${accession}_trimmed_1.fastq"
    mv "data/fastq/${accession}_2_val_2.fq" "data/fastq/${accession}_trimmed_2.fastq"

    # FastQC post-trim:
    mkdir -p "fastqc/${accession}_trimmed_1"
    time fastqc "data/fastq/${accession}_trimmed_1.fastq" -o "fastqc/${accession}_trimmed_1"
    mkdir -p "fastqc/${accession}_trimmed_2"
    time fastqc "data/fastq/${accession}_trimmed_2.fastq" -o "fastqc/${accession}_trimmed_2"
  fi

  # Align to hg38
  if [ -f "data/bam/${accession}.sorted.bam" ]; then
    echo "> Alignment: Skipping hg38 alignment for $accession, file exists"
  else
    echo "> Alignment: Working on ${accession}..."
    time bowtie2 \
      -x "$HG38" \
      -p "$threads" \
      -1 "data/fastq/${accession}_trimmed_1.fastq" \
      -2 "data/fastq/${accession}_trimmed_2.fastq" \
      -S "data/bam/${accession}.sam" \
      | tee "logs/${accession}_bowtie.log"

    # Convert to a binary file:
    samtools view -S -b "data/bam/${accession}.sam" > "data/bam/${accession}.bam"
    rm "data/bam/${accession}.sam"

    # Calculate statistics on generated file:
    echo "> Flagstats for ${accession}.bam:"
    samtools flagstat "data/bam/${accession}.bam" | tee "logs/${accession}_flagstats.log"

    echo "> Clean unmapped/secondary reads for ${accession}.bam:"
    samtools view -F 260 -O bam -o "data/bam/${accession}.clean.bam" "data/bam/${accession}.bam"
    
    # Calculate statistics on cleaned file:
    echo "> Flagstats for ${accession}.clean.bam:"
    samtools flagstat "data/bam/${accession}.clean.bam" | tee "logs/${accession}_flagstats_clean.log"

    echo "> Sort and index ${accession}.clean.bam:"
    samtools sort -O bam -o "data/bam/${accession}.sorted.bam" "data/bam/${accession}.clean.bam"
    samtools index "data/bam/${accession}.sorted.bam"

    # Remove bam files other than the final sorted one
    rm "data/bam/${accession}.bam"
    rm "data/bam/${accession}.clean.bam"
  fi

  # If we've reached this point, we should be able to clean up fastq files:
  if [ -f "data/fastq/${accession}_trimmed_1.fastq" ]; then
    rm "data/fastq/${accession}_trimmed_1.fastq"
  fi
  if [ -f "data/fastq/${accession}_trimmed_2.fastq" ]; then
    rm "data/fastq/${accession}_trimmed_2.fastq"
  fi
done

> Download: Skipping SRR27667476, already downloaded or processed to a BAM file
> Alignment: Skipping hg38 alignment for SRR27667476, file exists
Preference setting is: Prefer SRA Normalized Format files with full base quality scores if available.
SRR27667481 is an SRA Normalized Format file with full base quality scores.
Read 23622543 spots for SRR27667481
Written 23622543 spots for SRR27667481

real	14m32.085s
user	5m5.121s
sys	0m27.378s
> Reads in downloaded fastq files:
23622543
23622543
Multicore support not enabled. Proceeding with single-core trimming.
Path to Cutadapt set as: 'cutadapt' (default)
Cutadapt seems to be working fine (tested command 'cutadapt --version')
Cutadapt version: 5.0
single-core operation.
Proceeding with 'gzip' for decompression
To decrease CPU usage of decompression, please install 'igzip' and run again

No quality encoding type selected. Assuming that the data provided uses Sanger encoded Phred scores (default)


AUTO-DETECTING ADAPTER TYPE
===========================
Attempting to auto-detect adapter type from the first 1 million sequences of the first file (>> SRR27667481_1.fastq <<)

Found perfect matches for the following adapter sequences:
Adapter type	Count	Sequence	Sequences analysed	Percentage
Illumina	2208	AGATCGGAAGAGC	1000000	0.22
Nextera	17	CTGTCTCTTATA	1000000	0.00
smallRNA	6	TGGAATTCTCGG	1000000	0.00
Using Illumina adapter for trimming (count: 2208). Second best hit was Nextera (count: 17)

Writing report to 'SRR27667481_1.fastq_trimming_report.txt'

SUMMARISING RUN PARAMETERS
==========================
Input filename: SRR27667481_1.fastq
Trimming mode: paired-end
Trim Galore version: 0.6.10
Cutadapt version: 5.0
Number of cores used for trimming: 1
Quality Phred score cutoff: 20
Quality encoding type selected: ASCII+33
Adapter sequence: 'AGATCGGAAGAGC' (Illumina TruSeq, Sanger iPCR; auto-detected)
Maximum trimming error rate: 0.1 (default)
Minimum required adapter overlap (stringency): 1 bp
Minimum required sequence length for both reads before a sequence pair gets removed: 20 bp

Cutadapt seems to be fairly up-to-date (version 5.0). Setting -j 1
Writing final adapter and quality trimmed output to SRR27667481_1_trimmed.fq


  >>> Now performing quality (cutoff '-q 20') and adapter trimming in a single pass for the adapter sequence: 'AGATCGGAAGAGC' from file SRR27667481_1.fastq <<< 
10000000 sequences processed
20000000 sequences processed
This is cutadapt 5.0 with Python 3.11.11
Command line parameters: -j 1 -e 0.1 -q 20 -O 1 -a AGATCGGAAGAGC SRR27667481_1.fastq
Processing single-end reads on 1 core ...

=== Summary ===

Total reads processed:              23,622,543
Reads with adapters:                 8,138,885 (34.5%)
Reads written (passing filters):    23,622,543 (100.0%)

Total basepairs processed: 3,543,381,450 bp
Quality-trimmed:               6,021,862 bp (0.2%)
Total written (filtered):  3,518,736,174 bp (99.3%)

=== Adapter 1 ===

Sequence: AGATCGGAAGAGC; Type: regular 3'; Length: 13; Trimmed: 8138885 times

Minimum overlap: 1
No. of allowed errors:
1-9 bp: 0; 10-13 bp: 1

Bases preceding removed adapters:
  A: 29.5%
  C: 32.2%
  G: 20.6%
  T: 17.4%
  none/other: 0.4%

Overview of removed sequences
length	count	expect	max.err	error counts
1	5662334	5905635.8	0	5662334
2	1730631	1476408.9	0	1730631
3	489514	369102.2	0	489514
4	99716	92275.6	0	99716
5	27340	23068.9	0	27340
6	8542	5767.2	0	8542
7	5033	1441.8	0	5033
8	4559	360.5	0	4559
9	4507	90.1	0	4198 309
10	4341	22.5	1	3845 496
11	3957	5.6	1	3692 265
12	3550	1.4	1	3427 123
13	3237	0.4	1	3137 100
14	3181	0.4	1	3082 99
15	2987	0.4	1	2900 87
16	2761	0.4	1	2686 75
17	2674	0.4	1	2570 104
18	2562	0.4	1	2490 72
19	2207	0.4	1	2159 48
20	2230	0.4	1	2165 65
21	2081	0.4	1	2018 63
22	1993	0.4	1	1944 49
23	1910	0.4	1	1842 68
24	1816	0.4	1	1725 91
25	1720	0.4	1	1654 66
26	1550	0.4	1	1505 45
27	1608	0.4	1	1548 60
28	1464	0.4	1	1411 53
29	1438	0.4	1	1361 77
30	1295	0.4	1	1255 40
31	1189	0.4	1	1156 33
32	1180	0.4	1	1150 30
33	1088	0.4	1	1053 35
34	1061	0.4	1	1013 48
35	960	0.4	1	913 47
36	921	0.4	1	874 47
37	822	0.4	1	777 45
38	859	0.4	1	789 70
39	797	0.4	1	740 57
40	626	0.4	1	595 31
41	771	0.4	1	713 58
42	583	0.4	1	543 40
43	511	0.4	1	474 37
44	540	0.4	1	509 31
45	491	0.4	1	444 47
46	390	0.4	1	364 26
47	583	0.4	1	547 36
48	120	0.4	1	91 29
49	312	0.4	1	281 31
50	390	0.4	1	347 43
51	303	0.4	1	271 32
52	238	0.4	1	208 30
53	290	0.4	1	256 34
54	205	0.4	1	178 27
55	183	0.4	1	158 25
56	196	0.4	1	172 24
57	219	0.4	1	177 42
58	159	0.4	1	132 27
59	199	0.4	1	171 28
60	115	0.4	1	93 22
61	127	0.4	1	110 17
62	111	0.4	1	94 17
63	76	0.4	1	52 24
64	104	0.4	1	77 27
65	119	0.4	1	94 25
66	53	0.4	1	37 16
67	59	0.4	1	33 26
68	130	0.4	1	101 29
69	61	0.4	1	47 14
70	82	0.4	1	64 18
71	52	0.4	1	31 21
72	50	0.4	1	26 24
73	30	0.4	1	8 22
74	20	0.4	1	12 8
75	37	0.4	1	17 20
76	46	0.4	1	29 17
77	56	0.4	1	34 22
78	49	0.4	1	34 15
79	48	0.4	1	30 18
80	53	0.4	1	30 23
81	63	0.4	1	42 21
82	50	0.4	1	28 22
83	45	0.4	1	24 21
84	54	0.4	1	29 25
85	31	0.4	1	19 12
86	44	0.4	1	28 16
87	48	0.4	1	24 24
88	37	0.4	1	18 19
89	56	0.4	1	36 20
90	52	0.4	1	25 27
91	48	0.4	1	27 21
92	48	0.4	1	23 25
93	45	0.4	1	23 22
94	44	0.4	1	25 19
95	55	0.4	1	25 30
96	40	0.4	1	22 18
97	54	0.4	1	23 31
98	43	0.4	1	25 18
99	43	0.4	1	23 20
100	56	0.4	1	33 23
101	30	0.4	1	16 14
102	42	0.4	1	25 17
103	41	0.4	1	20 21
104	38	0.4	1	15 23
105	43	0.4	1	21 22
106	49	0.4	1	33 16
107	40	0.4	1	16 24
108	42	0.4	1	23 19
109	43	0.4	1	23 20
110	39	0.4	1	20 19
111	43	0.4	1	25 18
112	30	0.4	1	12 18
113	28	0.4	1	15 13
114	35	0.4	1	14 21
115	42	0.4	1	25 17
116	26	0.4	1	10 16
117	32	0.4	1	14 18
118	37	0.4	1	15 22
119	30	0.4	1	15 15
120	56	0.4	1	37 19
121	30	0.4	1	14 16
122	27	0.4	1	17 10
123	26	0.4	1	11 15
124	26	0.4	1	16 10
125	36	0.4	1	19 17
126	18	0.4	1	7 11
127	31	0.4	1	12 19
128	24	0.4	1	7 17
129	23	0.4	1	9 14
130	30	0.4	1	12 18
131	24	0.4	1	6 18
132	25	0.4	1	6 19
133	28	0.4	1	9 19
134	19	0.4	1	8 11
135	26	0.4	1	8 18
136	19	0.4	1	2 17
137	24	0.4	1	15 9
138	16	0.4	1	3 13
139	14	0.4	1	9 5
140	19	0.4	1	5 14
141	27	0.4	1	6 21
142	17	0.4	1	4 13
143	25	0.4	1	6 19
144	18	0.4	1	2 16
145	17	0.4	1	1 16
146	21	0.4	1	8 13
147	19	0.4	1	2 17
148	31	0.4	1	2 29
149	158	0.4	1	23 135
150	35943	0.4	1	417 35526

RUN STATISTICS FOR INPUT FILE: SRR27667481_1.fastq
=============================================
23622543 sequences processed in total
The length threshold of paired-end sequences gets evaluated later on (in the validation step)

Writing report to 'SRR27667481_2.fastq_trimming_report.txt'

SUMMARISING RUN PARAMETERS
==========================
Input filename: SRR27667481_2.fastq
Trimming mode: paired-end
Trim Galore version: 0.6.10
Cutadapt version: 5.0
Number of cores used for trimming: 1
Quality Phred score cutoff: 20
Quality encoding type selected: ASCII+33
Adapter sequence: 'AGATCGGAAGAGC' (Illumina TruSeq, Sanger iPCR; auto-detected)
Maximum trimming error rate: 0.1 (default)
Minimum required adapter overlap (stringency): 1 bp
Minimum required sequence length for both reads before a sequence pair gets removed: 20 bp

Cutadapt seems to be fairly up-to-date (version 5.0). Setting -j -j 1
Writing final adapter and quality trimmed output to SRR27667481_2_trimmed.fq


  >>> Now performing quality (cutoff '-q 20') and adapter trimming in a single pass for the adapter sequence: 'AGATCGGAAGAGC' from file SRR27667481_2.fastq <<< 
10000000 sequences processed
20000000 sequences processed
This is cutadapt 5.0 with Python 3.11.11
Command line parameters: -j 1 -e 0.1 -q 20 -O 1 -a AGATCGGAAGAGC SRR27667481_2.fastq
Processing single-end reads on 1 core ...

=== Summary ===

Total reads processed:              23,622,543
Reads with adapters:                 8,092,617 (34.3%)
Reads written (passing filters):    23,622,543 (100.0%)

Total basepairs processed: 3,543,381,450 bp
Quality-trimmed:               6,433,753 bp (0.2%)
Total written (filtered):  3,523,492,216 bp (99.4%)

=== Adapter 1 ===

Sequence: AGATCGGAAGAGC; Type: regular 3'; Length: 13; Trimmed: 8092617 times

Minimum overlap: 1
No. of allowed errors:
1-9 bp: 0; 10-13 bp: 1

Bases preceding removed adapters:
  A: 29.5%
  C: 32.2%
  G: 21.0%
  T: 17.4%
  none/other: 0.0%

Overview of removed sequences
length	count	expect	max.err	error counts
1	5637410	5905635.8	0	5637410
2	1737225	1476408.9	0	1737225
3	494113	369102.2	0	494113
4	99695	92275.6	0	99695
5	27592	23068.9	0	27592
6	8705	5767.2	0	8705
7	5202	1441.8	0	5202
8	4918	360.5	0	4918
9	4143	90.1	0	3851 292
10	4496	22.5	1	3972 524
11	3889	5.6	1	3610 279
12	3626	1.4	1	3480 146
13	3223	0.4	1	3107 116
14	3432	0.4	1	3302 130
15	2771	0.4	1	2659 112
16	2770	0.4	1	2681 89
17	3125	0.4	1	3007 118
18	2002	0.4	1	1933 69
19	2817	0.4	1	2711 106
20	1928	0.4	1	1863 65
21	1961	0.4	1	1888 73
22	2043	0.4	1	1966 77
23	1914	0.4	1	1835 79
24	1987	0.4	1	1877 110
25	1926	0.4	1	1865 61
26	1420	0.4	1	1369 51
27	1461	0.4	1	1398 63
28	1532	0.4	1	1474 58
29	1318	0.4	1	1267 51
30	1446	0.4	1	1386 60
31	1108	0.4	1	1055 53
32	1300	0.4	1	1217 83
33	1081	0.4	1	1025 56
34	1069	0.4	1	1029 40
35	883	0.4	1	849 34
36	903	0.4	1	863 40
37	891	0.4	1	831 60
38	805	0.4	1	774 31
39	856	0.4	1	794 62
40	747	0.4	1	694 53
41	676	0.4	1	637 39
42	564	0.4	1	518 46
43	516	0.4	1	486 30
44	566	0.4	1	524 42
45	421	0.4	1	387 34
46	466	0.4	1	416 50
47	387	0.4	1	346 41
48	367	0.4	1	334 33
49	372	0.4	1	327 45
50	364	0.4	1	335 29
51	319	0.4	1	295 24
52	282	0.4	1	258 24
53	288	0.4	1	262 26
54	220	0.4	1	184 36
55	234	0.4	1	185 49
56	242	0.4	1	191 51
57	154	0.4	1	132 22
58	204	0.4	1	162 42
59	268	0.4	1	200 68
60	132	0.4	1	90 42
61	155	0.4	1	127 28
62	198	0.4	1	148 50
63	106	0.4	1	79 27
64	107	0.4	1	66 41
65	225	0.4	1	161 64
66	94	0.4	1	64 30
67	88	0.4	1	60 28
68	72	0.4	1	47 25
69	106	0.4	1	74 32
70	129	0.4	1	92 37
71	92	0.4	1	60 32
72	72	0.4	1	48 24
73	76	0.4	1	48 28
74	90	0.4	1	67 23
75	127	0.4	1	88 39
76	81	0.4	1	55 26
77	60	0.4	1	39 21
78	34	0.4	1	18 16
79	77	0.4	1	34 43
80	68	0.4	1	43 25
81	69	0.4	1	44 25
82	75	0.4	1	40 35
83	71	0.4	1	44 27
84	70	0.4	1	40 30
85	54	0.4	1	25 29
86	85	0.4	1	55 30
87	75	0.4	1	47 28
88	58	0.4	1	29 29
89	84	0.4	1	49 35
90	76	0.4	1	42 34
91	70	0.4	1	46 24
92	77	0.4	1	38 39
93	72	0.4	1	35 37
94	64	0.4	1	43 21
95	75	0.4	1	45 30
96	61	0.4	1	34 27
97	78	0.4	1	41 37
98	65	0.4	1	42 23
99	76	0.4	1	52 24
100	65	0.4	1	39 26
101	69	0.4	1	28 41
102	57	0.4	1	36 21
103	69	0.4	1	36 33
104	53	0.4	1	28 25
105	52	0.4	1	33 19
106	80	0.4	1	47 33
107	43	0.4	1	31 12
108	69	0.4	1	40 29
109	53	0.4	1	36 17
110	64	0.4	1	36 28
111	58	0.4	1	29 29
112	57	0.4	1	25 32
113	38	0.4	1	22 16
114	50	0.4	1	27 23
115	57	0.4	1	37 20
116	46	0.4	1	18 28
117	36	0.4	1	18 18
118	63	0.4	1	27 36
119	118	0.4	1	83 35
120	64	0.4	1	46 18
121	44	0.4	1	20 24
122	44	0.4	1	20 24
123	39	0.4	1	15 24
124	33	0.4	1	15 18
125	39	0.4	1	23 16
126	25	0.4	1	10 15
127	28	0.4	1	15 13
128	18	0.4	1	8 10
129	18	0.4	1	11 7
130	31	0.4	1	14 17
131	25	0.4	1	7 18
132	24	0.4	1	6 18
133	15	0.4	1	8 7
134	23	0.4	1	7 16
135	25	0.4	1	9 16
136	8	0.4	1	2 6
137	26	0.4	1	15 11
138	19	0.4	1	4 15
139	24	0.4	1	9 15
140	12	0.4	1	5 7
141	21	0.4	1	7 14
142	10	0.4	1	5 5
143	27	0.4	1	5 22
144	17	0.4	1	2 15
145	11	0.4	1	1 10
146	16	0.4	1	8 8
147	17	0.4	1	3 14
148	12	0.4	1	2 10
149	34	0.4	1	18 16
150	484	0.4	1	422 62

RUN STATISTICS FOR INPUT FILE: SRR27667481_2.fastq
=============================================
23622543 sequences processed in total
The length threshold of paired-end sequences gets evaluated later on (in the validation step)

Validate paired-end files SRR27667481_1_trimmed.fq and SRR27667481_2_trimmed.fq
file_1: SRR27667481_1_trimmed.fq, file_2: SRR27667481_2_trimmed.fq


>>>>> Now validing the length of the 2 paired-end infiles: SRR27667481_1_trimmed.fq and SRR27667481_2_trimmed.fq <<<<<
Writing validated paired-end Read 1 reads to SRR27667481_1_val_1.fq
Writing validated paired-end Read 2 reads to SRR27667481_2_val_2.fq

Total number of sequences analysed: 23622543

Number of sequence pairs removed because at least one read was shorter than the length cutoff (20 bp): 39561 (0.17%)

Deleting both intermediate output files SRR27667481_1_trimmed.fq and SRR27667481_2_trimmed.fq

====================================================================================================

null
Started analysis of SRR27667481_trimmed_1.fastq
Approx 5% complete for SRR27667481_trimmed_1.fastq
Approx 10% complete for SRR27667481_trimmed_1.fastq
Approx 15% complete for SRR27667481_trimmed_1.fastq
Approx 20% complete for SRR27667481_trimmed_1.fastq
Approx 25% complete for SRR27667481_trimmed_1.fastq
Approx 30% complete for SRR27667481_trimmed_1.fastq
Approx 35% complete for SRR27667481_trimmed_1.fastq
Approx 40% complete for SRR27667481_trimmed_1.fastq
Approx 45% complete for SRR27667481_trimmed_1.fastq
Approx 50% complete for SRR27667481_trimmed_1.fastq
Approx 55% complete for SRR27667481_trimmed_1.fastq
Approx 60% complete for SRR27667481_trimmed_1.fastq
Approx 65% complete for SRR27667481_trimmed_1.fastq
Approx 70% complete for SRR27667481_trimmed_1.fastq
Approx 75% complete for SRR27667481_trimmed_1.fastq
Approx 80% complete for SRR27667481_trimmed_1.fastq
Approx 85% complete for SRR27667481_trimmed_1.fastq
Approx 90% complete for SRR27667481_trimmed_1.fastq
Approx 95% complete for SRR27667481_trimmed_1.fastq
Analysis complete for SRR27667481_trimmed_1.fastq

real	2m38.337s
user	2m33.853s
sys	0m6.388s
null
Started analysis of SRR27667481_trimmed_2.fastq
Approx 5% complete for SRR27667481_trimmed_2.fastq
Approx 10% complete for SRR27667481_trimmed_2.fastq
Approx 15% complete for SRR27667481_trimmed_2.fastq
Approx 20% complete for SRR27667481_trimmed_2.fastq
Approx 25% complete for SRR27667481_trimmed_2.fastq
Approx 30% complete for SRR27667481_trimmed_2.fastq
Approx 35% complete for SRR27667481_trimmed_2.fastq
Approx 40% complete for SRR27667481_trimmed_2.fastq
Approx 45% complete for SRR27667481_trimmed_2.fastq
Approx 50% complete for SRR27667481_trimmed_2.fastq
Approx 55% complete for SRR27667481_trimmed_2.fastq
Approx 60% complete for SRR27667481_trimmed_2.fastq
Approx 65% complete for SRR27667481_trimmed_2.fastq
Approx 70% complete for SRR27667481_trimmed_2.fastq
Approx 75% complete for SRR27667481_trimmed_2.fastq
Approx 80% complete for SRR27667481_trimmed_2.fastq
Approx 85% complete for SRR27667481_trimmed_2.fastq
Approx 90% complete for SRR27667481_trimmed_2.fastq
Approx 95% complete for SRR27667481_trimmed_2.fastq
Analysis complete for SRR27667481_trimmed_2.fastq

real	2m36.823s
user	2m34.854s
sys	0m5.994s
> Alignment: Working on SRR27667481...
23582982 reads; of these:
  23582982 (100.00%) were paired; of these:
    790456 (3.35%) aligned concordantly 0 times
    18783653 (79.65%) aligned concordantly exactly 1 time
    4008873 (17.00%) aligned concordantly >1 times
    ----
    790456 pairs aligned concordantly 0 times; of these:
      53542 (6.77%) aligned discordantly 1 time
    ----
    736914 pairs aligned 0 times concordantly or discordantly; of these:
      1473828 mates make up the pairs; of these:
        961844 (65.26%) aligned 0 times
        237482 (16.11%) aligned exactly 1 time
        274502 (18.63%) aligned >1 times
97.96% overall alignment rate

real	74m40.819s
user	294m49.510s
sys	3m5.751s
> Flagstats for SRR27667481.bam:
47165964 + 0 in total (QC-passed reads + QC-failed reads)
0 + 0 secondary
0 + 0 supplementary
0 + 0 duplicates
46204120 + 0 mapped (97.96% : N/A)
47165964 + 0 paired in sequencing
23582982 + 0 read1
23582982 + 0 read2
45585052 + 0 properly paired (96.65% : N/A)
45853102 + 0 with itself and mate mapped
351018 + 0 singletons (0.74% : N/A)
184616 + 0 with mate mapped to a different chr
104038 + 0 with mate mapped to a different chr (mapQ>=5)
> Clean unmapped/secondary reads for SRR27667481.bam:
> Flagstats for SRR27667481.clean.bam:
46204120 + 0 in total (QC-passed reads + QC-failed reads)
0 + 0 secondary
0 + 0 supplementary
0 + 0 duplicates
46204120 + 0 mapped (100.00% : N/A)
46204120 + 0 paired in sequencing
23133225 + 0 read1
23070895 + 0 read2
45585052 + 0 properly paired (98.66% : N/A)
45853102 + 0 with itself and mate mapped
351018 + 0 singletons (0.76% : N/A)
184616 + 0 with mate mapped to a different chr
104038 + 0 with mate mapped to a different chr (mapQ>=5)
> Sort and index SRR27667481.clean.bam:
[bam_sort_core] merging from 20 files and 1 in-memory blocks...


faCount -summary "$HG38_FASTA"

#seq	len	A	C	G	T	N	cpg
total	3298430636	923117203	642552917	645231996	925917038	161611482	32086716
prcnt	1.0  	0.2799	0.1948	0.1956	0.2807	0.0490	0.0097


HG38_EFFECTIVE_SIZE="3136819154"
BIN_SIZE="5"

accessions=(
  SRR27667476
  SRR27667481
)

for accession in "${accessions[@]}"; do
  if [ -f "data/bw/${accession}.bw" ]; then
    echo "> Coverage: Skipping $accession, already processed to bw file"
  else
    time bamCoverage \
      -b "data/bam/${accession}.sorted.bam" \
      --normalizeUsing RPGC \
      --effectiveGenomeSize "$HG38_EFFECTIVE_SIZE" \
      -o "data/bw/${accession}.bw"  \
      -bs "$BIN_SIZE"
  fi
done

normalization: 1x (effective genome size 3136819154)
bamFilesList: ['data/bam/SRR27667476.sorted.bam']
binLength: 5
numberOfSamples: None
blackListFileName: None
skipZeroOverZero: False
bed_and_bin: False
genomeChunkSize: None
defaultFragmentLength: read length
numberOfProcessors: 1
verbose: False
region: None
bedFile: None
minMappingQuality: None
ignoreDuplicates: False
chrsToSkip: []
stepSize: 5
center_read: False
samFlag_include: None
samFlag_exclude: None
minFragmentLength: 0
maxFragmentLength: 0
zerosToNans: False
smoothLength: None
save_data: False
out_file_for_raw_data: None
maxPairedFragmentLength: 1000

real	38m54.654s
user	38m30.003s
sys	0m29.456s
normalization: 1x (effective genome size 3136819154)
bamFilesList: ['data/bam/SRR27667481.sorted.bam']
binLength: 5
numberOfSamples: None
blackListFileName: None
skipZeroOverZero: False
bed_and_bin: False
genomeChunkSize: None
defaultFragmentLength: read length
numberOfProcessors: 1
verbose: False
region: None
bedFile: None
minMappingQuality: None
ignoreDuplicates: False
chrsToSkip: []
stepSize: 5
center_read: False
samFlag_include: None
samFlag_exclude: None
minFragmentLength: 0
maxFragmentLength: 0
zerosToNans: False
smoothLength: None
save_data: False
out_file_for_raw_data: None
maxPairedFragmentLength: 1000

real	43m30.018s
user	43m13.833s
sys	0m23.095s


HG38_EFFECTIVE_SIZE="3136819154"

# Reference: https://www.ncbi.nlm.nih.gov/sra?term=SRX23335288
replicate="SRR27667476"

# Reference: https://www.ncbi.nlm.nih.gov/sra?term=SRX23335283
input="SRR27667481"

time macs2 callpeak \
  -t "data/bam/${replicate}.sorted.bam" \
  -c "data/bam/${input}.sorted.bam" \
  -n data/peaks/ezh2_old \
  -g "$HG38_EFFECTIVE_SIZE" \
  -q 0.05

INFO  @ Fri, 03 Jan 2025 16:08:38: 
# Command line: callpeak -t data/bam/SRR27667476.sorted.bam -c data/bam/SRR27667481.sorted.bam -n data/peaks/ezh2_old -g 3136819154 -q 0.05
# ARGUMENTS LIST:
# name = data/peaks/ezh2_old
# format = AUTO
# ChIP-seq file = ['data/bam/SRR27667476.sorted.bam']
# control file = ['data/bam/SRR27667481.sorted.bam']
# effective genome size = 3.14e+09
# band width = 300
# model fold = [5, 50]
# qvalue cutoff = 5.00e-02
# The maximum gap between significant sites is assigned as the read length/tag size.
# The minimum length of peaks is assigned as the predicted fragment length "d".
# Larger dataset will be scaled towards smaller dataset.
# Range for calculating regional lambda is: 1000 bps and 10000 bps
# Broad region calling is off
# Paired-End mode is off
 
INFO  @ Fri, 03 Jan 2025 16:08:38: #1 read tag files... 
INFO  @ Fri, 03 Jan 2025 16:08:38: #1 read treatment tags... 
INFO  @ Fri, 03 Jan 2025 16:08:38: Detected format is: BAM 
INFO  @ Fri, 03 Jan 2025 16:08:38: * Input file is gzipped. 
INFO  @ Fri, 03 Jan 2025 16:08:44:  1000000 
INFO  @ Fri, 03 Jan 2025 16:08:51:  2000000 
INFO  @ Fri, 03 Jan 2025 16:08:57:  3000000 
INFO  @ Fri, 03 Jan 2025 16:09:03:  4000000 
INFO  @ Fri, 03 Jan 2025 16:09:09:  5000000 
INFO  @ Fri, 03 Jan 2025 16:09:15:  6000000 
INFO  @ Fri, 03 Jan 2025 16:09:21:  7000000 
INFO  @ Fri, 03 Jan 2025 16:09:27:  8000000 
INFO  @ Fri, 03 Jan 2025 16:09:33:  9000000 
INFO  @ Fri, 03 Jan 2025 16:09:39:  10000000 
INFO  @ Fri, 03 Jan 2025 16:09:45:  11000000 
INFO  @ Fri, 03 Jan 2025 16:09:51:  12000000 
INFO  @ Fri, 03 Jan 2025 16:09:56:  13000000 
INFO  @ Fri, 03 Jan 2025 16:10:02:  14000000 
INFO  @ Fri, 03 Jan 2025 16:10:09:  15000000 
INFO  @ Fri, 03 Jan 2025 16:10:14:  16000000 
INFO  @ Fri, 03 Jan 2025 16:10:20:  17000000 
INFO  @ Fri, 03 Jan 2025 16:10:26:  18000000 
INFO  @ Fri, 03 Jan 2025 16:10:33:  19000000 
INFO  @ Fri, 03 Jan 2025 16:10:35: 19349364 reads have been read. 
INFO  @ Fri, 03 Jan 2025 16:10:35: #1.2 read input tags... 
INFO  @ Fri, 03 Jan 2025 16:10:35: Detected format is: BAM 
INFO  @ Fri, 03 Jan 2025 16:10:35: * Input file is gzipped. 
INFO  @ Fri, 03 Jan 2025 16:10:42:  1000000 
INFO  @ Fri, 03 Jan 2025 16:10:48:  2000000 
INFO  @ Fri, 03 Jan 2025 16:10:54:  3000000 
INFO  @ Fri, 03 Jan 2025 16:11:00:  4000000 
INFO  @ Fri, 03 Jan 2025 16:11:06:  5000000 
INFO  @ Fri, 03 Jan 2025 16:11:13:  6000000 
INFO  @ Fri, 03 Jan 2025 16:11:19:  7000000 
INFO  @ Fri, 03 Jan 2025 16:11:25:  8000000 
INFO  @ Fri, 03 Jan 2025 16:11:31:  9000000 
INFO  @ Fri, 03 Jan 2025 16:11:37:  10000000 
INFO  @ Fri, 03 Jan 2025 16:11:43:  11000000 
INFO  @ Fri, 03 Jan 2025 16:11:50:  12000000 
INFO  @ Fri, 03 Jan 2025 16:11:56:  13000000 
INFO  @ Fri, 03 Jan 2025 16:12:02:  14000000 
INFO  @ Fri, 03 Jan 2025 16:12:08:  15000000 
INFO  @ Fri, 03 Jan 2025 16:12:14:  16000000 
INFO  @ Fri, 03 Jan 2025 16:12:20:  17000000 
INFO  @ Fri, 03 Jan 2025 16:12:27:  18000000 
INFO  @ Fri, 03 Jan 2025 16:12:33:  19000000 
INFO  @ Fri, 03 Jan 2025 16:12:39:  20000000 
INFO  @ Fri, 03 Jan 2025 16:12:45:  21000000 
INFO  @ Fri, 03 Jan 2025 16:12:52:  22000000 
INFO  @ Fri, 03 Jan 2025 16:12:57: 22792526 reads have been read. 
INFO  @ Fri, 03 Jan 2025 16:12:57: #1 tag size is determined as 114 bps 
INFO  @ Fri, 03 Jan 2025 16:12:57: #1 tag size = 114.0 
INFO  @ Fri, 03 Jan 2025 16:12:57: #1  total tags in treatment: 19349364 
INFO  @ Fri, 03 Jan 2025 16:12:57: #1 user defined the maximum tags... 
INFO  @ Fri, 03 Jan 2025 16:12:57: #1 filter out redundant tags at the same location and the same strand by allowing at most 1 tag(s) 
INFO  @ Fri, 03 Jan 2025 16:12:58: #1  tags after filtering in treatment: 11434395 
INFO  @ Fri, 03 Jan 2025 16:12:58: #1  Redundant rate of treatment: 0.41 
INFO  @ Fri, 03 Jan 2025 16:12:58: #1  total tags in control: 22792526 
INFO  @ Fri, 03 Jan 2025 16:12:58: #1 user defined the maximum tags... 
INFO  @ Fri, 03 Jan 2025 16:12:58: #1 filter out redundant tags at the same location and the same strand by allowing at most 1 tag(s) 
INFO  @ Fri, 03 Jan 2025 16:12:58: #1  tags after filtering in control: 16536128 
INFO  @ Fri, 03 Jan 2025 16:12:58: #1  Redundant rate of control: 0.27 
INFO  @ Fri, 03 Jan 2025 16:12:58: #1 finished! 
INFO  @ Fri, 03 Jan 2025 16:12:58: #2 Build Peak Model... 
INFO  @ Fri, 03 Jan 2025 16:12:58: #2 looking for paired plus/minus strand peaks... 
INFO  @ Fri, 03 Jan 2025 16:13:04: #2 number of paired peaks: 61929 
INFO  @ Fri, 03 Jan 2025 16:13:04: start model_add_line... 
INFO  @ Fri, 03 Jan 2025 16:13:05: start X-correlation... 
INFO  @ Fri, 03 Jan 2025 16:13:05: end of X-cor 
INFO  @ Fri, 03 Jan 2025 16:13:05: #2 finished! 
INFO  @ Fri, 03 Jan 2025 16:13:05: #2 predicted fragment length is 282 bps 
INFO  @ Fri, 03 Jan 2025 16:13:05: #2 alternative fragment length(s) may be 282 bps 
INFO  @ Fri, 03 Jan 2025 16:13:05: #2.2 Generate R script for model : data/peaks/ezh2_old_model.r 
INFO  @ Fri, 03 Jan 2025 16:13:05: #3 Call peaks... 
INFO  @ Fri, 03 Jan 2025 16:13:05: #3 Pre-compute pvalue-qvalue table... 
INFO  @ Fri, 03 Jan 2025 16:15:35: #3 Call peaks for each chromosome... 
INFO  @ Fri, 03 Jan 2025 16:16:54: #4 Write output xls file... data/peaks/ezh2_old_peaks.xls 
INFO  @ Fri, 03 Jan 2025 16:16:54: #4 Write peak in narrowPeak format file... data/peaks/ezh2_old_peaks.narrowPeak 
INFO  @ Fri, 03 Jan 2025 16:16:54: #4 Write summits bed file... data/peaks/ezh2_old_summits.bed 
INFO  @ Fri, 03 Jan 2025 16:16:54: Done! 

real	8m16.543s
user	8m13.596s
sys	0m9.816s


Rscript data/peaks/ezh2_old_model.r

null device 
          1


wc -l data/peaks/ezh2_old_peaks.narrowPeak

12344 data/peaks/ezh2_old_peaks.narrowPeak


cat data/peaks/ezh2_old_peaks.narrowPeak | cut -f 1-3 > data/peaks/ezh2_old_peaks.bed

time computeMatrix reference-point \
    --scoreFileName data/bw/SRR27667476.bw \
    --regionsFileName data/peaks/ezh2_old_peaks.bed \
    --referencePoint center \
    --upstream 3000 \
    --downstream 3000 \
    --binSize 5 \
    --outFileName data/peaks/ezh2_old_peaks.tab.gz

real	13m30.463s
user	13m32.118s
sys	0m5.064s


plotHeatmap \
    --matrixFile data/peaks/ezh2_old_peaks.tab.gz \
    --outFileName figures/ezh2_old_peaks.png \
    --heatmapHeight 15 \
    --heatmapWidth 6 \
    --refPointLabel "peak center" \
    --regionsLabel peaks


if [ ! -f data/peaks/ezh2_old_downloaded.narrowPeak ]; then
  ln -s \
      /mnt/storage/r0978323/project_data/prc2_age_index/downloaded/GSE253773_O1P2_peaks.narrowPeak \
      ./data/peaks/ezh2_old_downloaded.narrowPeak
fi

if [ ! -f data/peaks/ezh2_neonatal_downloaded.narrowPeak ]; then
  ln -s \
      /mnt/storage/r0978323/project_data/prc2_age_index/downloaded/GSE253773_N1P2_peaks.narrowPeak \
      ./data/peaks/ezh2_neonatal_downloaded.narrowPeak
fi

ls data/peaks/*.narrowPeak

data/peaks/ezh2_neonatal_downloaded.narrowPeak
data/peaks/ezh2_old_downloaded.narrowPeak
data/peaks/ezh2_old_peaks.narrowPeak


wc -l data/peaks/*.narrowPeak

  61493 data/peaks/ezh2_neonatal_downloaded.narrowPeak
  25139 data/peaks/ezh2_old_downloaded.narrowPeak
  12344 data/peaks/ezh2_old_peaks.narrowPeak
  98976 total


bedtools getfasta \
  -fi "$HG38_FASTA" \
  -bed data/peaks/ezh2_old_peaks.bed \
  -fo data/peaks/ezh2_old_peaks.fasta

head -5 data/peaks/ezh2_old_peaks.fasta

>NC_000001.11:48306-48588
AGAATTAAAGGGGTAATTATCAGAATGAAAATGGTTTAATGAAACTGTGTCTATCAGTTCTGAAAAGGGCCTCTATCACAATGAACTAAGGTAGTTATGAATAGAGCTAAAACTTAGGCAacaccatcctggacataggaacgggcaaagatttcatgacaaagacacggaaaccaatcacaacaaaagcaaaaattgagaagTGGAATCTAATAAAacaatagcttctgcacagcaaaagaagctaccaacaaagtaaacagacaacctacagaatgggag
>NC_000001.11:48797-49159
gagatatcatctcataccaggcagaatggctattattaaaaagtcaaaaataacagatatcggtgaggttacagagaaaagggaacacttatacactgttggtgggactgtaaattatttcaaccattgtggaaagcagtatgggatggcgattcctcaaaaagccaaaaacagaactatcattcaacccagcaattccattactgggtatatacccagaagaatataaatcgttctaccataaagacgcatgcatgagaatgttcattgcagcactactcacaatagcagagacatggaatcaacttaaatgcccatcagtaacagactggataaagaaagtgtggtacagatacaccgtg
>NC_000001.11:108980-109667


sed 's/^NC_0\+\([0-9]\+\)\.[0-9]\+\t/chr\1\t/' data/peaks/ezh2_old_peaks.bed \
  | sed 's/^chr23/chrX/' \
  | sed 's/^chr24/chrY/' \
  | grep -v '^N' \
  > data/peaks/ezh2_old_peaks.chr.bed


bp_tsv="/mnt/storage/r0978323/project_data/great/biological_processes.tsv"

echo "> All found biological processes:"
cat "$bp_tsv" | grep -v '^#' | wc -l

echo "> Development-related:"
cat "$bp_tsv" | grep -e '\(development\|differentiation\|morphogenesis\)' | wc -l

> All found biological processes:
500
> Development-related:
147


region_to_gene="/mnt/storage/r0978323/project_data/great/region_to_gene.txt"
gene_to_region="/mnt/storage/r0978323/project_data/great/gene_to_region.txt"

echo "Total regions:"
wc -l "$region_to_gene"
echo "Total genes:"
wc -l "$gene_to_region"
echo

echo "HOX genes:"
grep "^HOX" "$gene_to_region" | cut -f1 | tr '\n' ',' | sed 's/,/, /g' | sed 's/, $//'
echo
echo
echo "Pair-rule genes:"
grep "^PAX" "$gene_to_region" | cut -f1 | tr '\n' ',' | sed 's/,/, /g' | sed 's/, $//'
echo
echo
echo "Cyclin-dependent kinases:"
grep "^CDK" "$gene_to_region" | cut -f1 | tr '\n' ',' | sed 's/,/, /g' | sed 's/, $//'

Total regions:
11463 /mnt/storage/r0978323/project_data/great/region_to_gene.txt
Total genes:
1914 /mnt/storage/r0978323/project_data/great/gene_to_region.txt

HOX genes:
HOXA13, HOXB13, HOXB9, HOXC12, HOXC13, HOXD1, HOXD10, HOXD11, HOXD12, HOXD13, HOXD3, HOXD8, HOXD9

Pair-rule genes:
PAX2, PAX3, PAX4, PAX5, PAX6, PAX9

Cyclin-dependent kinases:
CDK18, CDK4, CDKN2A


cat data/peaks/ezh2_old_downloaded.narrowPeak | cut -f 1-3 > data/peaks/ezh2_old_downloaded.bed


region_to_gene="/mnt/storage/r0978323/project_data/great/region_to_gene_downloaded.txt"
gene_to_region="/mnt/storage/r0978323/project_data/great/gene_to_region_downloaded.txt"

echo "Total regions:"
wc -l "$region_to_gene"
echo "Total genes:"
wc -l "$gene_to_region"
echo

echo "HOX genes:"
grep "^HOX" "$gene_to_region" | cut -f1 | tr '\n' ',' | sed 's/,/, /g' | sed 's/, $//'
echo
echo
echo "Pair-rule genes:"
grep "^PAX" "$gene_to_region" | cut -f1 | tr '\n' ',' | sed 's/,/, /g' | sed 's/, $//'
echo
echo
echo "Cyclin-dependent kinases:"
grep "^CDK" "$gene_to_region" | cut -f1 | tr '\n' ',' | sed 's/,/, /g' | sed 's/, $//'

Total regions:
25141 /mnt/storage/r0978323/project_data/great/region_to_gene_downloaded.txt
Total genes:
2805 /mnt/storage/r0978323/project_data/great/gene_to_region_downloaded.txt

HOX genes:
HOXA11, HOXA13, HOXB9, HOXC11, HOXC13, HOXD1, HOXD10, HOXD11, HOXD12, HOXD13, HOXD3, HOXD8, HOXD9

Pair-rule genes:
PAX1, PAX2, PAX3, PAX4, PAX6, PAX9

Cyclin-dependent kinases:
CDK18, CDK4, CDKN2A

Sample	SRA id	SRA run id	URL
Old replicate 1	SRX23335288	SRR27667476	https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSM8027713
Old replicate 1, input	SRX23335283	SRR27667481	https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSM8027708

PRC2 Targets in CpG Islands¶

Table of contents:¶

Technical preparation¶

Sequence download¶

Quality control¶

Alignment¶

Coverage¶

Peak calling¶

Peak comparison¶

Motif discovery¶

Gene discovery¶

Closing thoughts¶