Finding the no of files and directories in a path

From: Nipuna <nipunajoset(at)gmail(dot)com>
To: pgsql-novice(at)postgresql(dot)org
Subject: Finding the no of files and directories in a path
Date: 2013-02-07 17:39:59
Message-ID: CAPCz1_0JYiqSbO9CCpuqZsAMPq4LH9C+Mw6nQguoRGZQHE+eAg@mail.gmail.com
Views: Raw Message | Whole Thread | Download mbox | Resend email
Thread:
Lists: pgsql-novice

Hi,

I have a table with 45 million records.I have 3 columns say, file_size,
kb_alloc, misc_attr

*The misc_attr column is the path*. The sample record is given below

"57971"; "65536"; "FAu --
/gpfs/scratch05/data/lm/build_weekly/official/mobi/eng-USA/messaging/large/2011_08_21/clusters/models/field/lexfiles/respell.map.spelling"
"851"; "8192"; "FAu --
/gpfs/scratch05/data/lm/build_weekly/official/mobi/eng-USA/messaging/large/2011_08_21/clusters/models/field/lexfiles/rewriteForm.map"
"8192" "16384"; "D2u --
/gpfs/scratch05/data/lm/scratch/alfred_dielmann/NVI/PT/MP14k/TEST_SAMPLE_SUPPRESSED_DIGITS_datapack/2p/outputlnk/recog/txt/58ABB8B7-C44C11DF-A73A949F-7B4DBCC0_CALLER"
"8192" "16384" "D2u --
/gpfs/scratch05/data/lm/scratch/alfred_dielmann/NVI/PT/MP14k/TEST_SAMPLE_SUPPRESSED_DIGITS_datapack/2p/outputlnk/recog/log/58ABB8B7-C44C11DF-A73A949F-7B4DBCC0_CALLER"
"8192" "16384" "D2u --
/gpfs/scratch05/data/lm/scratch/alfred_dielmann/NVI/PT/MP14k/TEST_SAMPLE_SUPPRESSED_DIGITS_datapack/2p/outputlnk/recog/time/58ABB8B7-C44C11DF-A73A949F-7B4DBCC0_CALLER"
"4487"; "8192"; "FAu --
/gpfs/scratch05/data/lm/scratch/yi_su/numbered-list/data.old/gpfs/static02/data/lm/data/english/text/scansoft/english-us/medical/transcend/90/90.47915.2000.UNKNOWN.txt.in.gz"
"8192"; "16384"; "D2u --
/gpfs/scratch05/data/lm/scratch/alfred_dielmann/DNS_VOC/legal/eng-GBR/BUILDS/BASELINE/lm/build/vocabs/natspeak/eng-GBR/legal/large/2012_04_20/plato/eng/build-bin-12.00.300"
"8192" "16384" "D2u --
/gpfs/scratch05/data/lm/scratch/alfred_dielmann/DNS_VOC/legal/eng-GBR/BUILDS/BASELINE/lm/build/vocabs/natspeak/eng-GBR/legal/large/2012_04_20/plato/eng/build-bin-13.00.100"
"7" "8192" "FAu --
/gpfs/scratch05/data/lm/scratch/sophie_charignon/plato/jpn/.hg/undo.branch"
"8192" "16384" "D2u --
/gpfs/scratch05/data/lm/scratch/alfred_dielmann/DNS_VOC/legal/eng-GBR/BUILDS/BASELINE/lm/build/vocabs/natspeak/eng-GBR/legal/large/2012_04_20/plato/eng/build-bin-13.02.100"

I want a table which gives me the directory count and file count and their
file size.My table should look like this;

*dir_path
kb_alloc USED_SIZE folder_count file_count
*"/gpfs/scratch05/data/lm/build_weekly/vocabs";
49152; 37249; 9; 1
"/gpfs/scratch05/data/lm/build_weekly/vocabs/natspeak";
49152; 37249; 8; 1
"/gpfs/scratch05/data/lm/build_weekly/vocabs/natspeak/eng-GBR";
49152; 37249; 7; 1
"/gpfs/scratch05/data/lm/build_weekly/vocabs/natspeak/eng-GBR/legal";
49152; 37249; 6; 1
"/gpfs/scratch05/data/lm/scratch/erik_larsson";
2093824; 21783001; 59; 25
"/gpfs/scratch05/data/lm/scratch/erik_larsson/zep";
21651456; 21620619; 30; 7
"/gpfs/scratch05/data/lm/scratch/erik_larsson/zep/builds";
21602304 ; 1572331; 22; 6
"/gpfs/scratch05/data/lm/scratch/erik_larsson/zep/builds/lm";
21602304; 21572331; 21; 6

I have written a function to create the table in this format. But a per
the calculation it might the take 600+ hours to return the result. Is there
any way I could optimize the function I have written. Below is the function
I have written;

-- Function: hierarchial_file_structure()

-- DROP FUNCTION hierarchial_file_structure();

CREATE OR REPLACE FUNCTION hierarchial_file_structure()
RETURNS void AS
$BODY$

DECLARE

file_size1 BIGINT;
folder_count1 integer;
kb_alloc1 BIGINT;
misc_attr1 TEXT;
folder_names TEXT ARRAY;
path_names TEXT ARRAY;
current_path TEXT;
isNewRow BOOLEAN;
mFILES RECORD;
DataType TEXT;
temp_count BIGINT;
sqlcount bigint;

BEGIN

DELETE FROM file_structure;

sqlcount :=0;

FOR mFILES IN SELECT CAST(coalesce(file_size, '0') AS bigint) as
file_size, CAST(coalesce(kb_alloc, '0') AS bigint) as kb_alloc, misc_attr
FROM storage where (substring(misc_attr,1,1)) in ('F','D')
LOOP

file_size1 := mFILES.file_size;
kb_alloc1 := mFILES.kb_alloc;
misc_attr1 := mFILES.misc_attr;

DataType :=(substring(misc_attr1,1,1));
folder_names := regexp_split_to_array(misc_attr1, '/');
current_path := '';

-- LOOP THROUGH FOLDER NAMES AND CREATE THE PATH NAMES
IF DataType = 'F' THEN
-- IGNORE THE LAST ELEMENT SINCE ITS THE FILE NAME

FOR iFolderArrayCount IN 2..(array_length(folder_names,1)-1)
LOOP

current_path := current_path || '/' ||
folder_names[iFolderArrayCount];

path_names[iFolderArrayCount-1] := current_path;

END LOOP;
ELSE

FOR iFolderArrayCount IN 2..(array_length(folder_names,1))
LOOP

current_path := current_path || '/' ||
folder_names[iFolderArrayCount];
path_names[iFolderArrayCount-1] := current_path;

END LOOP;
END IF;

--LOOP THROUH THE PATH NAMES

FOR iPathArrayCount IN 1..(array_length(path_names,1))
LOOP

temp_count := array_length(path_names,1) - iPathArrayCount;

select into folder_count1 count(*) from file_structure where
dir_path = ANY(path_names[iPathArrayCount+1:array_length(path_names,1)]);

folder_count1 := temp_count - folder_count1;
current_path := path_names[iPathArrayCount];

IF EXISTS(select distinct(dir_path) from file_structure where
dir_path = current_path) THEN

--Row Exists hence update the count using the count of
folders that does not exist in the summary table

IF(DataType = 'F') THEN

update file_structure set
kb_alloc = kb_alloc + kb_alloc1,
used_size = used_size+ file_size1,
file_count = file_count +1,
folder_count = folder_count+ folder_count1
where dir_path = current_path;

ELSE
update file_structure set
kb_alloc = kb_alloc + kb_alloc1,
used_size = used_size+ file_size1,
folder_count = folder_count+folder_count1
where dir_path = current_path;

END IF;
ELSE

IF(DataType = 'F') THEN

INSERT INTO file_structure
(dir_path,kb_alloc,used_size, folder_count,file_count) VALUES
(current_path,kb_alloc1,
file_size1,folder_count1,1);
ELSE
INSERT INTO file_structure
(dir_path,kb_alloc,used_size, folder_count,file_count) VALUES
(current_path,kb_alloc1,
file_size1,folder_count1,0);
END IF;

END IF;
END LOOP;
END LOOP;

END;

$BODY$
LANGUAGE plpgsql VOLATILE
COST 100;
ALTER FUNCTION hierarchial_file_structure()
OWNER TO postgres;

Please let me know if there is a better way to write this function.Thanks
in advance

--
Regards,
Nipuna

Browse pgsql-novice by date

  From Date Subject
Next Message Tom Lane 2013-02-07 17:51:15 Re: Re: [NOVICE] Re: [NOVICE] Problems with ñ and tildes / CSV import problems in PostgreSQL 9.1
Previous Message Zach Seaman 2013-02-07 17:05:19 Re: [NOVICE] Re: [NOVICE] Problems with ñ and tildes / CSV import problems in PostgreSQL 9.1