Tools and Scripting

Available tools to process data


Find data

grep - print lines matching a pattern

#!/bin/bash
 
 
grep regexp file     #    Retrieve all lines matching regexp
grep -v regexp file  #    Retrieve all lines that do NOT match regexp
grep -o regexp file  #    Retrieve only the text matching regexp
grep -i regexp file  #    Ignore case
grep -w regexp file  #    Match whole words only
 
# useful option : -C nb_lines : display nb_lines of context before and after each match.
# useful option : -A nb_lines : display nb_lines after each match.
# useful option : -B nb_lines : display nb_lines before each match.
# useful option : -R : recurse into all sub-directories.
 
 
grep -n regexp file  #    Add line numbers
grep -c regexp file  #    Only count matching lines
 
grep -l regexp *.txt #    Print only matching file names
 
#  method 1:  combined with find (preferred: handles any file name)
find . -name "*txt" -exec grep regexp {} +
#  method 2:  combined with find (NUL-delimited loop: safe with spaces in names,
#  unlike the classic unquoted `for F in $(find ...)` idiom)
find . -name "*txt" -print0 | while IFS= read -r -d '' F; do grep regexp "$F"; done
 
 
# egrep is the same as grep -E (extended regular expressions)
 
# IP address xxx.xxx.xxx.xxx (a file operand is required, otherwise grep reads stdin)
egrep "([0-9]{1,3}\.){3}[0-9]{1,3}" file
 
# MAC address xx:xx:xx:xx:xx:xx
egrep "([0-9a-fA-F]{2}\:){5}[0-9a-fA-F]{2}" file
 

in2csv, csvcut, csvgrep, csvjson, sql2csv - the best CSV (Comma Separated Values) tools to process data - csvkit

# csvkit examples — each tool reads a tabular file and writes the converted
# result to stdout, redirected here into a new file.
#Convert Excel to CSV:
in2csv data.xls > data.csv
#Convert JSON to CSV:
in2csv data.json > data.csv
#Convert to JSON:
csvjson data.csv > data.json
 
#Query with SQL:
# NOTE(review): the table name in the query ("data") appears to come from the
# input file name (data.csv) — verify against the csvsql documentation.
csvsql --query "select name from data where age > 30" data.csv > new.csv
 

xmllint - the xml tool

Suppose one has the following XML data.

<Data>
    <Measurements>
      <Time>2018-06-19T08:45:00</Time>
      <Temp>12.8</Temp>
    </Measurements>
    <Measurements>
      <Time>2018-06-19T09:00:00</Time>
      <Temp>11.8</Temp>
    </Measurements>
    <Measurements>
      <Time>2018-06-19T09:15:00</Time>
      <Temp>14.8</Temp>
    </Measurements>
</Data>
 

It can easily be accessed through xmllint and the --xpath option. The XPath syntax is defined as a standard.

#validation of xml versus its xsd (--noout suppresses the document echo)
xmllint --noout --schema file.xsd file.xml
 
#Extract data from the file (--xpath requires a file argument)
xmllint --xpath "//Data/Measurements[last()]/Temp/text()" data.xml
xmllint --xpath "count(//Data/Measurements)" data.xml
 
#Compute the mean temperature with XPath arithmetic
xmllint --xpath "sum(//Data/Measurements/Temp) div count(//Data/Measurements)" data.xml
 
 
 
#Download the file , and then extract the data ('-' makes xmllint read stdin)
curl -s "https://franckbehaghel.eu/programming/bash/data.xml" |  xmllint --xpath "//Data/Measurements[last()]/Temp/text()" -
 
Manipulating data tools

sed - stream editor

#!/bin/bash
 
#print commands (-n suppresses automatic printing; the p command prints explicitly)
 
sed -n 'p' file # print the whole file
 
sed -n '1 p' file # print first line of file
 
sed -n '3 p' file # print 3rd line of file
 
sed -n 'a,b p' file # print lines a through b of file (a and b are line numbers)
 
seq 1 100 | sed -n  '4 p; 7 p'
 
# return :
#4
#7
 
 
seq 1 100 | sed -n  '4,7 p'
 
# return :
#4
#5
#6
#7
 
 # start~step address (GNU sed extension)
seq 1 100 | sed -n 1~2p   # only odd lines
 
 
 # n : load the next input line into the pattern space
echo -e "AAA\nBBB\nCCC\nDDD" | sed -n '/BBB/ {n;p}'
# return :
# CCC
 
#command substitution
echo -e "AAA"  | sed 's/AAA/BBB/g'     # return BBB
 
# reducing a number : keep only the first 5 decimal places
echo "-1.23343230000000000000" | sed 's#\.\([0-9]\{5\}\)\([0-9]*\)#.\1#'
 
 
#I : case-insensitive substitution (GNU sed extension)
echo -e "AAA" | sed 's/AaA/BBB/I'      # return BBB
 
 
 # sequence of commands : next line; substitution ; print
 # the numeric flag selects WHICH occurrence to substitute; g substitutes all
echo -e "AAA\nBBB\nCCCC\nDDD" | sed -n '/BBB/ {n;s/C/Z/1p}'   # return ZCCC
echo -e "AAA\nBBB\nCCCC\nDDD" | sed -n '/BBB/ {n;s/C/Z/2p}'   # return CZCC
echo -e "AAA\nBBB\nCCCC\nDDD" | sed -n '/BBB/ {n;s/C/Z/3p}'   # return CCZC
echo -e "AAA\nBBB\nCCCC\nDDD" | sed -n '/BBB/ {n;s/C/Z/4p}'   # return CCCZ
echo -e "AAA\nBBB\nCCCC\nDDD" | sed -n '/BBB/ {n;s/C/Z/gp}'   # return ZZZZ
 
 
 # command evaluate : e executes the pattern space as a shell command (GNU sed)
echo -e "AAA\nBBB\nCCC\nDDD" | sed 's/BBB/echo ZZZ/e'
 
 # command hold
 #  store line 1 in the hold space
 #  insert line 1 before line 5
 
seq 1 10 | sed -n '1 h; 1 d; 5 {G;x;P;x};P'
 
 
 # swap the two fields around ':'
echo -e "first:second\nOne:Two\n" | sed  's/\(.*\):\(.*\)/\2:\1/'
 
 
 
sed '/^#/d' file                 # remove comments
 
sed -e '/^$/d;/^#/d' file        # remove empty lines and comments
 
echo 'é' | sed 'y/éèê/eee/'      # transliteration (y command)
 

sort - sort lines of text files

#!/bin/bash
# sort reads the file directly; piping through cat is unnecessary (UUOC)
sort test1.csv
 
sort -r test1.csv # useful option : -r  reverse the result of comparisons
 

awk - pattern scanning and processing language

cat test.csv
#1 1
#2 4
#3 4
#4 4
#5 7
#6 3
#7 2
 
# BEGIN runs before any input line, END after the last one
awk -e 'BEGIN { print "START" } { print    } END   { print "STOP"  }' test.csv
#START
#1 1
#2 4
#3 4
#4 4
#5 7
#6 3
#7 2
#STOP
 
awk -e 'BEGIN { print "START" } { add=$1+$2; print add   } END   { print "STOP"  }' test.csv
#START
#2
#6
#7
#8
#12
#9
#9
#STOP
 
awk -e 'BEGIN { print "START" } { add=sin($1+$2); printf ("sinus(%d+%d)=  %f\n",$1,$2 ,add)   } END   { print "STOP"  }' test.csv
#START
#sinus(1+1)=  0.909297
#sinus(2+4)=  -0.279415
#sinus(3+4)=  0.656987
#sinus(4+4)=  0.989358
#sinus(5+7)=  -0.536573
#sinus(6+3)=  0.412118
#sinus(7+2)=  0.412118
#STOP
 
#Swapping columns ($2 is printed before $1; a file operand is needed,
#otherwise awk would wait on stdin)
awk -e '{ printf("%s  %s \n",$2,$1)}' test.csv
#Adding columns
seq 1 5 | awk '{s+=$1} END{print "Sum: "s, "\nNumber of lines: "NR, "\nAverage: "s/(NR)}'
 
#Sum: 15 
#Number of lines: 5 
#Average: 3
 

bc - An arbitrary precision calculator language

#!/bin/bash
 
# Feed expressions to bc via here-strings; -l loads the math library
# (which also sets scale=20, giving 20 decimal places by default).
bc -l <<< "sqrt(2)"
#1.41421356237309504880
 
# Compute pi to 100 decimal places: pi = 4 * arctan(1), a() being arctangent.
bc -l <<< "scale=100; 4*a(1)"
#3.141592653589793238462643383279502884197169399375105820974944592307\                                                                                                                                   
#8164062862089986280348253421170676                     

cut - remove sections from each line of files

#!/bin/bash
 
 
# cut reads the file directly; piping through cat is unnecessary (UUOC).
# -d';' sets the field delimiter, -f5 selects the 5th field.
cut -d';' -f5 file.csv
 

iconv - codeset conversion

iconv
# convert infile from Latin-1 (iso-8859-1) to UTF-8; iconv writes to stdout,
# redirected here into outfile
iconv -f iso-8859-1 -t utf-8 <infile >outfile

dos2unix - DOS/Mac to Unix and vice versa text file format converter

dos2unix
# convert file.txt from DOS (CRLF) to Unix (LF) line endings in place;
# -k keeps the file's original timestamp
dos2unix -k file.txt

Generating data tools

seq - print a sequence of numbers

seq
# print the integers 1 through 5, one per line
seq 1 5


Format data tools

printf - format and print data - C programmers will appreciate how easily it formats data

printf
printf "%05d\n" 1   # prints "00001" (zero-pad to a width of 5)

Database tools

uniq - report or omit repeated lines
join - join lines of two files on a common field
comm - compare two sorted files line by line

comm
# -12 suppresses column 1 (lines unique to the first file) and column 2
# (lines unique to the second), leaving only lines common to both files.
# comm expects sorted input; --nocheck-order (GNU) skips the sortedness check.
comm --nocheck-order -12 test.csv test1.csv

Logging tools

tee - read from standard input and write to standard output and files

tee
# tee copies its stdin to stdout AND to logfile.log, so the program's output
# is displayed and logged at the same time
program_generating_many_log | tee logfile.log


File tools

tail - output the last part of files
head - output the first part of files
wc - print newline, word, and byte counts for each file



Display data

gnuplot - an interactive plotting program

gnuplot
# -p (persist) keeps the plot window open after gnuplot exits;
# -e runs the given command: plot column 2 of test.csv against column 1
gnuplot -p -e 'plot "test.csv" using 1:2'




Tools to go deeper in scripting


Specific usage ...
Tools may be adapted for specific usage. Shell scripting is often described as slow; making your own tools can be more efficient.


Tool 1 : common Find the common values between two different files
Tool 2 : different Find the differing values between two different files
Tool 3 : byte_at Retrieve 1 byte from a file at a specified address
Tool 4 : statistic Compute numerical statistics from a text file



Makefile: Makefile to build those programs