From: | Nipuna <nipunajoset(at)gmail(dot)com> |
---|---|
To: | pgsql-novice(at)postgresql(dot)org |
Subject: | Finding the no of files and directories in a path |
Date: | 2013-02-07 17:39:59 |
Message-ID: | CAPCz1_0JYiqSbO9CCpuqZsAMPq4LH9C+Mw6nQguoRGZQHE+eAg@mail.gmail.com |
Views: | Raw Message | Whole Thread | Download mbox | Resend email |
Thread: | |
Lists: | pgsql-novice |
Hi,
I have a table with 45 million records.I have 3 columns say, file_size,
kb_alloc, misc_attr
*The misc_attr column is the path*. The sample record is given below
"57971"; "65536"; "FAu --
/gpfs/scratch05/data/lm/build_weekly/official/mobi/eng-USA/messaging/large/2011_08_21/clusters/models/field/lexfiles/respell.map.spelling"
"851"; "8192"; "FAu --
/gpfs/scratch05/data/lm/build_weekly/official/mobi/eng-USA/messaging/large/2011_08_21/clusters/models/field/lexfiles/rewriteForm.map"
"8192" "16384"; "D2u --
/gpfs/scratch05/data/lm/scratch/alfred_dielmann/NVI/PT/MP14k/TEST_SAMPLE_SUPPRESSED_DIGITS_datapack/2p/outputlnk/recog/txt/58ABB8B7-C44C11DF-A73A949F-7B4DBCC0_CALLER"
"8192" "16384" "D2u --
/gpfs/scratch05/data/lm/scratch/alfred_dielmann/NVI/PT/MP14k/TEST_SAMPLE_SUPPRESSED_DIGITS_datapack/2p/outputlnk/recog/log/58ABB8B7-C44C11DF-A73A949F-7B4DBCC0_CALLER"
"8192" "16384" "D2u --
/gpfs/scratch05/data/lm/scratch/alfred_dielmann/NVI/PT/MP14k/TEST_SAMPLE_SUPPRESSED_DIGITS_datapack/2p/outputlnk/recog/time/58ABB8B7-C44C11DF-A73A949F-7B4DBCC0_CALLER"
"4487"; "8192"; "FAu --
/gpfs/scratch05/data/lm/scratch/yi_su/numbered-list/data.old/gpfs/static02/data/lm/data/english/text/scansoft/english-us/medical/transcend/90/90.47915.2000.UNKNOWN.txt.in.gz"
"8192"; "16384"; "D2u --
/gpfs/scratch05/data/lm/scratch/alfred_dielmann/DNS_VOC/legal/eng-GBR/BUILDS/BASELINE/lm/build/vocabs/natspeak/eng-GBR/legal/large/2012_04_20/plato/eng/build-bin-12.00.300"
"8192" "16384" "D2u --
/gpfs/scratch05/data/lm/scratch/alfred_dielmann/DNS_VOC/legal/eng-GBR/BUILDS/BASELINE/lm/build/vocabs/natspeak/eng-GBR/legal/large/2012_04_20/plato/eng/build-bin-13.00.100"
"7" "8192" "FAu --
/gpfs/scratch05/data/lm/scratch/sophie_charignon/plato/jpn/.hg/undo.branch"
"8192" "16384" "D2u --
/gpfs/scratch05/data/lm/scratch/alfred_dielmann/DNS_VOC/legal/eng-GBR/BUILDS/BASELINE/lm/build/vocabs/natspeak/eng-GBR/legal/large/2012_04_20/plato/eng/build-bin-13.02.100"
I want a table which gives me the directory count and file count and their
file size.My table should look like this;
*dir_path
kb_alloc USED_SIZE folder_count file_count
*"/gpfs/scratch05/data/lm/build_weekly/vocabs";
49152; 37249; 9; 1
"/gpfs/scratch05/data/lm/build_weekly/vocabs/natspeak";
49152; 37249; 8; 1
"/gpfs/scratch05/data/lm/build_weekly/vocabs/natspeak/eng-GBR";
49152; 37249; 7; 1
"/gpfs/scratch05/data/lm/build_weekly/vocabs/natspeak/eng-GBR/legal";
49152; 37249; 6; 1
"/gpfs/scratch05/data/lm/scratch/erik_larsson";
2093824; 21783001; 59; 25
"/gpfs/scratch05/data/lm/scratch/erik_larsson/zep";
21651456; 21620619; 30; 7
"/gpfs/scratch05/data/lm/scratch/erik_larsson/zep/builds";
21602304 ; 1572331; 22; 6
"/gpfs/scratch05/data/lm/scratch/erik_larsson/zep/builds/lm";
21602304; 21572331; 21; 6
I have written a function to create the table in this format. But a per
the calculation it might the take 600+ hours to return the result. Is there
any way I could optimize the function I have written. Below is the function
I have written;
-- Function: hierarchial_file_structure()
-- DROP FUNCTION hierarchial_file_structure();
CREATE OR REPLACE FUNCTION hierarchial_file_structure()
RETURNS void AS
$BODY$
DECLARE
file_size1 BIGINT;
folder_count1 integer;
kb_alloc1 BIGINT;
misc_attr1 TEXT;
folder_names TEXT ARRAY;
path_names TEXT ARRAY;
current_path TEXT;
isNewRow BOOLEAN;
mFILES RECORD;
DataType TEXT;
temp_count BIGINT;
sqlcount bigint;
BEGIN
DELETE FROM file_structure;
sqlcount :=0;
FOR mFILES IN SELECT CAST(coalesce(file_size, '0') AS bigint) as
file_size, CAST(coalesce(kb_alloc, '0') AS bigint) as kb_alloc, misc_attr
FROM storage where (substring(misc_attr,1,1)) in ('F','D')
LOOP
file_size1 := mFILES.file_size;
kb_alloc1 := mFILES.kb_alloc;
misc_attr1 := mFILES.misc_attr;
DataType :=(substring(misc_attr1,1,1));
folder_names := regexp_split_to_array(misc_attr1, '/');
current_path := '';
-- LOOP THROUGH FOLDER NAMES AND CREATE THE PATH NAMES
IF DataType = 'F' THEN
-- IGNORE THE LAST ELEMENT SINCE ITS THE FILE NAME
FOR iFolderArrayCount IN 2..(array_length(folder_names,1)-1)
LOOP
current_path := current_path || '/' ||
folder_names[iFolderArrayCount];
path_names[iFolderArrayCount-1] := current_path;
END LOOP;
ELSE
FOR iFolderArrayCount IN 2..(array_length(folder_names,1))
LOOP
current_path := current_path || '/' ||
folder_names[iFolderArrayCount];
path_names[iFolderArrayCount-1] := current_path;
END LOOP;
END IF;
--LOOP THROUH THE PATH NAMES
FOR iPathArrayCount IN 1..(array_length(path_names,1))
LOOP
temp_count := array_length(path_names,1) - iPathArrayCount;
select into folder_count1 count(*) from file_structure where
dir_path = ANY(path_names[iPathArrayCount+1:array_length(path_names,1)]);
folder_count1 := temp_count - folder_count1;
current_path := path_names[iPathArrayCount];
IF EXISTS(select distinct(dir_path) from file_structure where
dir_path = current_path) THEN
--Row Exists hence update the count using the count of
folders that does not exist in the summary table
IF(DataType = 'F') THEN
update file_structure set
kb_alloc = kb_alloc + kb_alloc1,
used_size = used_size+ file_size1,
file_count = file_count +1,
folder_count = folder_count+ folder_count1
where dir_path = current_path;
ELSE
update file_structure set
kb_alloc = kb_alloc + kb_alloc1,
used_size = used_size+ file_size1,
folder_count = folder_count+folder_count1
where dir_path = current_path;
END IF;
ELSE
IF(DataType = 'F') THEN
INSERT INTO file_structure
(dir_path,kb_alloc,used_size, folder_count,file_count) VALUES
(current_path,kb_alloc1,
file_size1,folder_count1,1);
ELSE
INSERT INTO file_structure
(dir_path,kb_alloc,used_size, folder_count,file_count) VALUES
(current_path,kb_alloc1,
file_size1,folder_count1,0);
END IF;
END IF;
END LOOP;
END LOOP;
END;
$BODY$
LANGUAGE plpgsql VOLATILE
COST 100;
ALTER FUNCTION hierarchial_file_structure()
OWNER TO postgres;
Please let me know if there is a better way to write this function.Thanks
in advance
--
Regards,
Nipuna
From | Date | Subject | |
---|---|---|---|
Next Message | Tom Lane | 2013-02-07 17:51:15 | Re: Re: [NOVICE] Re: [NOVICE] Problems with ñ and tildes / CSV import problems in PostgreSQL 9.1 |
Previous Message | Zach Seaman | 2013-02-07 17:05:19 | Re: [NOVICE] Re: [NOVICE] Problems with ñ and tildes / CSV import problems in PostgreSQL 9.1 |